1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Classifies an address.
82
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
85
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
88
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
91
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
94
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
97
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
100
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
103
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
112 };
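/* For illustration (representative forms only, not an exhaustive list),
   the classifications above correspond to addresses such as the
   following, assuming base register x0 and index register x1/w1:

     ADDRESS_REG_IMM    ldr x2, [x0, #16]
     ADDRESS_REG_WB     ldr x2, [x0, #16]!   or   ldr x2, [x0], #16
     ADDRESS_REG_REG    ldr x2, [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   ldr x2, [x0, w1, uxtw #3]
     ADDRESS_REG_SXTW   ldr x2, [x0, w1, sxtw #3]
     ADDRESS_LO_SUM     ldr x2, [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x2, .Lliteral   (pc-relative literal load)  */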
113
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
121 };
122
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
125 {
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
128
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
135
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
138
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
142
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
145
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
148
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
153 };
154
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
161 {}
162
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
181 {}
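/* For illustration (a representative example, not taken from real input):
   the Advanced SIMD constant materialized by "movi v0.4h, #0x2a, lsl #8"
   would be described by elt_mode == HImode, value == 0x2a,
   step == NULL_RTX, insn == MOV, modifier == LSL and shift == 8.  */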
182
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
185
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
188
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
210
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
213
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
216
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
219
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
222
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
225
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
229 {
230 const char* name;
231 unsigned int flag;
232 };
233
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
237 {
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
242 };
243
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
247 {
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
252 };
253
254 /* Tuning parameters. */
255
256 static const struct cpu_addrcost_table generic_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
273 {
274 {
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
318 };
319
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
321 {
322 {
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
327 },
328 1, /* pre_modify */
329 1, /* post_modify */
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
333 2, /* imm_offset */
334 };
335
336 static const struct cpu_regmove_cost generic_regmove_cost =
337 {
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
344 };
345
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
347 {
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
354 };
355
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
357 {
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
364 };
365
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
367 {
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost (the actual costs are 4 and 9). */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
374 };
375
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
377 {
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
395 {
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
410 };
411
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost =
414 {
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
430 };
431
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost =
434 {
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
450 };
451
452 /* Costs for vector insn classes for Cortex-A57. */
453 static const struct cpu_vector_cost cortexa57_vector_cost =
454 {
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
470 };
471
472 static const struct cpu_vector_cost exynosm1_vector_cost =
473 {
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
489 };
490
491 /* Costs for vector insn classes for X-Gene 1. */
492 static const struct cpu_vector_cost xgene1_vector_cost =
493 {
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
509 };
510
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
513 {
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
529 };
530
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost =
533 {
534 1, /* Predictable. */
535 3 /* Unpredictable. */
536 };
537
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes =
540 {
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_NONE /* recip_sqrt */
544 };
545
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes =
548 {
549 AARCH64_APPROX_NONE, /* division */
550 AARCH64_APPROX_ALL, /* sqrt */
551 AARCH64_APPROX_ALL /* recip_sqrt */
552 };
553
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes =
556 {
557 AARCH64_APPROX_NONE, /* division */
558 AARCH64_APPROX_NONE, /* sqrt */
559 AARCH64_APPROX_ALL /* recip_sqrt */
560 };
561
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune =
564 {
565 0, /* num_slots */
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
572 };
573
574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
575 {
576 0, /* num_slots */
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
583 };
584
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
586 {
587 4, /* num_slots */
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
594 };
595
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
597 {
598 8, /* num_slots */
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune thunderx_prefetch_tune =
608 {
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
619 {
620 8, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
627 };
628
629 static const struct tune_params generic_tunings =
630 {
631 &cortexa57_extra_costs,
632 &generic_addrcost_table,
633 &generic_regmove_cost,
634 &generic_vector_cost,
635 &generic_branch_cost,
636 &generic_approx_modes,
637 4, /* memmov_cost */
638 2, /* issue_rate */
639 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
640 "8", /* function_align. */
641 "4", /* jump_align. */
642 "8", /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
651 &generic_prefetch_tune
652 };
653
654 static const struct tune_params cortexa35_tunings =
655 {
656 &cortexa53_extra_costs,
657 &generic_addrcost_table,
658 &cortexa53_regmove_cost,
659 &generic_vector_cost,
660 &generic_branch_cost,
661 &generic_approx_modes,
662 4, /* memmov_cost */
663 1, /* issue_rate */
664 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
666 "16", /* function_align. */
667 "4", /* jump_align. */
668 "8", /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
677 &generic_prefetch_tune
678 };
679
680 static const struct tune_params cortexa53_tunings =
681 {
682 &cortexa53_extra_costs,
683 &generic_addrcost_table,
684 &cortexa53_regmove_cost,
685 &generic_vector_cost,
686 &generic_branch_cost,
687 &generic_approx_modes,
688 4, /* memmov_cost */
689 2, /* issue_rate */
690 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
692 "16", /* function_align. */
693 "4", /* jump_align. */
694 "8", /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
703 &generic_prefetch_tune
704 };
705
706 static const struct tune_params cortexa57_tunings =
707 {
708 &cortexa57_extra_costs,
709 &generic_addrcost_table,
710 &cortexa57_regmove_cost,
711 &cortexa57_vector_cost,
712 &generic_branch_cost,
713 &generic_approx_modes,
714 4, /* memmov_cost */
715 3, /* issue_rate */
716 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
718 "16", /* function_align. */
719 "4", /* jump_align. */
720 "8", /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
729 &generic_prefetch_tune
730 };
731
732 static const struct tune_params cortexa72_tunings =
733 {
734 &cortexa57_extra_costs,
735 &generic_addrcost_table,
736 &cortexa57_regmove_cost,
737 &cortexa57_vector_cost,
738 &generic_branch_cost,
739 &generic_approx_modes,
740 4, /* memmov_cost */
741 3, /* issue_rate */
742 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
744 "16", /* function_align. */
745 "4", /* jump_align. */
746 "8", /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
755 &generic_prefetch_tune
756 };
757
758 static const struct tune_params cortexa73_tunings =
759 {
760 &cortexa57_extra_costs,
761 &generic_addrcost_table,
762 &cortexa57_regmove_cost,
763 &cortexa57_vector_cost,
764 &generic_branch_cost,
765 &generic_approx_modes,
766 4, /* memmov_cost. */
767 2, /* issue_rate. */
768 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
770 "16", /* function_align. */
771 "4", /* jump_align. */
772 "8", /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
781 &generic_prefetch_tune
782 };
783
784
785
786 static const struct tune_params exynosm1_tunings =
787 {
788 &exynosm1_extra_costs,
789 &exynosm1_addrcost_table,
790 &exynosm1_regmove_cost,
791 &exynosm1_vector_cost,
792 &generic_branch_cost,
793 &exynosm1_approx_modes,
794 4, /* memmov_cost */
795 3, /* issue_rate */
796 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
797 "4", /* function_align. */
798 "4", /* jump_align. */
799 "4", /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
808 &exynosm1_prefetch_tune
809 };
810
811 static const struct tune_params thunderxt88_tunings =
812 {
813 &thunderx_extra_costs,
814 &generic_addrcost_table,
815 &thunderx_regmove_cost,
816 &thunderx_vector_cost,
817 &generic_branch_cost,
818 &generic_approx_modes,
819 6, /* memmov_cost */
820 2, /* issue_rate */
821 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
822 "8", /* function_align. */
823 "8", /* jump_align. */
824 "8", /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
833 &thunderxt88_prefetch_tune
834 };
835
836 static const struct tune_params thunderx_tunings =
837 {
838 &thunderx_extra_costs,
839 &generic_addrcost_table,
840 &thunderx_regmove_cost,
841 &thunderx_vector_cost,
842 &generic_branch_cost,
843 &generic_approx_modes,
844 6, /* memmov_cost */
845 2, /* issue_rate */
846 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
847 "8", /* function_align. */
848 "8", /* jump_align. */
849 "8", /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
859 &thunderx_prefetch_tune
860 };
861
862 static const struct tune_params xgene1_tunings =
863 {
864 &xgene1_extra_costs,
865 &xgene1_addrcost_table,
866 &xgene1_regmove_cost,
867 &xgene1_vector_cost,
868 &generic_branch_cost,
869 &xgene1_approx_modes,
870 6, /* memmov_cost */
871 4, /* issue_rate */
872 AARCH64_FUSE_NOTHING, /* fusible_ops */
873 "16", /* function_align. */
874 "8", /* jump_align. */
875 "16", /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
884 &generic_prefetch_tune
885 };
886
887 static const struct tune_params qdf24xx_tunings =
888 {
889 &qdf24xx_extra_costs,
890 &qdf24xx_addrcost_table,
891 &qdf24xx_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
898 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
899 "16", /* function_align. */
900 "8", /* jump_align. */
901 "16", /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &qdf24xx_prefetch_tune
911 };
912
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
914 for now. */
915 static const struct tune_params saphira_tunings =
916 {
917 &generic_extra_costs,
918 &generic_addrcost_table,
919 &generic_regmove_cost,
920 &generic_vector_cost,
921 &generic_branch_cost,
922 &generic_approx_modes,
923 4, /* memmov_cost */
924 4, /* issue_rate */
925 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
926 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
927 "16", /* function_align. */
928 "8", /* jump_align. */
929 "16", /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
938 &generic_prefetch_tune
939 };
940
941 static const struct tune_params thunderx2t99_tunings =
942 {
943 &thunderx2t99_extra_costs,
944 &thunderx2t99_addrcost_table,
945 &thunderx2t99_regmove_cost,
946 &thunderx2t99_vector_cost,
947 &generic_branch_cost,
948 &generic_approx_modes,
949 4, /* memmov_cost. */
950 4, /* issue_rate. */
951 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
953 "16", /* function_align. */
954 "8", /* jump_align. */
955 "16", /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
965 };
966
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
969 {
970 const char* name;
971 void (*parse_override)(const char*, struct tune_params*);
972 };
973
974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
976
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions[] =
979 {
980 { "fuse", aarch64_parse_fuse_string },
981 { "tune", aarch64_parse_tune_string },
982 { NULL, NULL }
983 };
984
985 /* A processor implementing AArch64. */
986 struct processor
987 {
988 const char *const name;
989 enum aarch64_processor ident;
990 enum aarch64_processor sched_core;
991 enum aarch64_arch arch;
992 unsigned architecture_version;
993 const unsigned long flags;
994 const struct tune_params *const tune;
995 };
996
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures[] =
999 {
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1004 };
1005
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores[] =
1008 {
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1017 };
1018
1019
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1025
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1028
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1030
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1033 {
1034 const char *const name;
1035 const unsigned long flags_on;
1036 const unsigned long flags_off;
1037 };
1038
1039 typedef enum aarch64_cond_code
1040 {
1041 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1044 }
1045 aarch64_cc;
1046
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
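/* The enumeration above lists each condition next to its logical inverse,
   so flipping the low bit maps a condition to its inverse: for example
   AARCH64_EQ (0) <-> AARCH64_NE (1), AARCH64_CS (2) <-> AARCH64_CC (3)
   and AARCH64_GE (10) <-> AARCH64_LT (11).  */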
1048
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes[] =
1051 {
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1054 };
1055
1056 /* Generate code to enable conditional branches in functions over 1 MiB. */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059 const char * branch_format)
1060 {
1061 rtx_code_label * tmp_label = gen_label_rtx ();
1062 char label_buf[256];
1063 char buffer[128];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065 CODE_LABEL_NUMBER (tmp_label));
1066 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067 rtx dest_label = operands[pos_label];
1068 operands[pos_label] = tmp_label;
1069
1070 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071 output_asm_insn (buffer, operands);
1072
1073 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074 operands[pos_label] = dest_label;
1075 output_asm_insn (buffer, operands);
1076 return "";
1077 }
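/* Sketch of the output (register and label names are illustrative only):
   if the caller passes the inverted short-range branch "cbz\tx0, " as
   BRANCH_FORMAT, the sequence emitted above is roughly

	cbz	x0, .Lfb4	// short-range branch over the far branch
	b	.Ldest		// unconditional branch, +/- 128 MiB range
   .Lfb4:

   so the original branch can reach targets beyond its normal
   (at most +/- 1 MiB) range.  */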
1078
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode)
1081 {
1082 if (TARGET_GENERAL_REGS_ONLY)
1083 if (FLOAT_MODE_P (mode))
1084 error ("%qs is incompatible with the use of floating-point types",
1085 "-mgeneral-regs-only");
1086 else
1087 error ("%qs is incompatible with the use of vector types",
1088 "-mgeneral-regs-only");
1089 else
1090 if (FLOAT_MODE_P (mode))
1091 error ("%qs feature modifier is incompatible with the use of"
1092 " floating-point types", "+nofp");
1093 else
1094 error ("%qs feature modifier is incompatible with the use of"
1095 " vector types", "+nofp");
1096 }
1097
1098 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1099 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1100 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1101 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1102 and GENERAL_REGS is lower than the memory cost (in this case the best class
1103 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1104 cost results in bad allocations with many redundant int<->FP moves which
1105 are expensive on various cores.
1106 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1107 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1108 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1109 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1110 The result of this is that it is no longer inefficient to have a higher
1111 memory move cost than the register move cost.
1112 */
1113
1114 static reg_class_t
1115 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1116 reg_class_t best_class)
1117 {
1118 machine_mode mode;
1119
1120 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1121 || !reg_class_subset_p (FP_REGS, allocno_class))
1122 return allocno_class;
1123
1124 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1125 || !reg_class_subset_p (FP_REGS, best_class))
1126 return best_class;
1127
1128 mode = PSEUDO_REGNO_MODE (regno);
1129 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1130 }
1131
1132 static unsigned int
1133 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1134 {
1135 if (GET_MODE_UNIT_SIZE (mode) == 4)
1136 return aarch64_tune_params.min_div_recip_mul_sf;
1137 return aarch64_tune_params.min_div_recip_mul_df;
1138 }
1139
1140 /* Return the reassociation width of treeop OPC with mode MODE. */
1141 static int
1142 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1143 {
1144 if (VECTOR_MODE_P (mode))
1145 return aarch64_tune_params.vec_reassoc_width;
1146 if (INTEGRAL_MODE_P (mode))
1147 return aarch64_tune_params.int_reassoc_width;
1148 /* Avoid reassociating floating point addition so we emit more FMAs. */
1149 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1150 return aarch64_tune_params.fp_reassoc_width;
1151 return 1;
1152 }
1153
1154 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1155 unsigned
1156 aarch64_dbx_register_number (unsigned regno)
1157 {
1158 if (GP_REGNUM_P (regno))
1159 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1160 else if (regno == SP_REGNUM)
1161 return AARCH64_DWARF_SP;
1162 else if (FP_REGNUM_P (regno))
1163 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1164 else if (PR_REGNUM_P (regno))
1165 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1166 else if (regno == VG_REGNUM)
1167 return AARCH64_DWARF_VG;
1168
1169 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1170 equivalent DWARF register. */
1171 return DWARF_FRAME_REGISTERS;
1172 }
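/* For reference, with the AArch64 DWARF numbering this maps x0-x30 to
   0-30, sp to 31, v0-v31 to 64-95, the SVE predicate registers p0-p15
   to 48-63 and the vector-granule register VG to 46; the exact values
   are supplied by the AARCH64_DWARF_* constants.  */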
1173
1174 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1175 static bool
1176 aarch64_advsimd_struct_mode_p (machine_mode mode)
1177 {
1178 return (TARGET_SIMD
1179 && (mode == OImode || mode == CImode || mode == XImode));
1180 }
1181
1182 /* Return true if MODE is an SVE predicate mode. */
1183 static bool
1184 aarch64_sve_pred_mode_p (machine_mode mode)
1185 {
1186 return (TARGET_SVE
1187 && (mode == VNx16BImode
1188 || mode == VNx8BImode
1189 || mode == VNx4BImode
1190 || mode == VNx2BImode));
1191 }
1192
1193 /* Three mutually-exclusive flags describing a vector or predicate type. */
1194 const unsigned int VEC_ADVSIMD = 1;
1195 const unsigned int VEC_SVE_DATA = 2;
1196 const unsigned int VEC_SVE_PRED = 4;
1197 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1198 a structure of 2, 3 or 4 vectors. */
1199 const unsigned int VEC_STRUCT = 8;
1200 /* Useful combinations of the above. */
1201 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1202 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1203
1204 /* Return a set of flags describing the vector properties of mode MODE.
1205 Ignore modes that are not supported by the current target. */
1206 static unsigned int
1207 aarch64_classify_vector_mode (machine_mode mode)
1208 {
1209 if (aarch64_advsimd_struct_mode_p (mode))
1210 return VEC_ADVSIMD | VEC_STRUCT;
1211
1212 if (aarch64_sve_pred_mode_p (mode))
1213 return VEC_SVE_PRED;
1214
1215 scalar_mode inner = GET_MODE_INNER (mode);
1216 if (VECTOR_MODE_P (mode)
1217 && (inner == QImode
1218 || inner == HImode
1219 || inner == HFmode
1220 || inner == SImode
1221 || inner == SFmode
1222 || inner == DImode
1223 || inner == DFmode))
1224 {
1225 if (TARGET_SVE)
1226 {
1227 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1228 return VEC_SVE_DATA;
1229 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1230 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1232 return VEC_SVE_DATA | VEC_STRUCT;
1233 }
1234
1235 /* This includes V1DF but not V1DI (which doesn't exist). */
1236 if (TARGET_SIMD
1237 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1238 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1239 return VEC_ADVSIMD;
1240 }
1241
1242 return 0;
1243 }
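/* Rough examples (assuming TARGET_SIMD and a variable-length SVE target):
   V4SImode classifies as VEC_ADVSIMD, OImode (a structure of two 128-bit
   vectors) as VEC_ADVSIMD | VEC_STRUCT, VNx4SImode as VEC_SVE_DATA,
   VNx4BImode as VEC_SVE_PRED, and a scalar mode such as SImode as 0.  */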
1244
1245 /* Return true if MODE is any of the data vector modes, including
1246 structure modes. */
1247 static bool
1248 aarch64_vector_data_mode_p (machine_mode mode)
1249 {
1250 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1251 }
1252
1253 /* Return true if MODE is an SVE data vector mode; either a single vector
1254 or a structure of vectors. */
1255 static bool
1256 aarch64_sve_data_mode_p (machine_mode mode)
1257 {
1258 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1259 }
1260
1261 /* Implement target hook TARGET_ARRAY_MODE. */
1262 static opt_machine_mode
1263 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1264 {
1265 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1266 && IN_RANGE (nelems, 2, 4))
1267 return mode_for_vector (GET_MODE_INNER (mode),
1268 GET_MODE_NUNITS (mode) * nelems);
1269
1270 return opt_machine_mode ();
1271 }
1272
1273 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1274 static bool
1275 aarch64_array_mode_supported_p (machine_mode mode,
1276 unsigned HOST_WIDE_INT nelems)
1277 {
1278 if (TARGET_SIMD
1279 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1280 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1281 && (nelems >= 2 && nelems <= 4))
1282 return true;
1283
1284 return false;
1285 }
1286
1287 /* Return the SVE predicate mode to use for elements that have
1288 ELEM_NBYTES bytes, if such a mode exists. */
1289
1290 opt_machine_mode
1291 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1292 {
1293 if (TARGET_SVE)
1294 {
1295 if (elem_nbytes == 1)
1296 return VNx16BImode;
1297 if (elem_nbytes == 2)
1298 return VNx8BImode;
1299 if (elem_nbytes == 4)
1300 return VNx4BImode;
1301 if (elem_nbytes == 8)
1302 return VNx2BImode;
1303 }
1304 return opt_machine_mode ();
1305 }
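/* For example, predicates for 4-byte elements (SImode or SFmode data)
   use VNx4BImode, i.e. one significant predicate bit per 32-bit lane.  */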
1306
1307 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1308
1309 static opt_machine_mode
1310 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1311 {
1312 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1313 {
1314 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1315 machine_mode pred_mode;
1316 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1317 return pred_mode;
1318 }
1319
1320 return default_get_mask_mode (nunits, nbytes);
1321 }
1322
1323 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1324 prefer to use the first arithmetic operand as the else value if
1325 the else value doesn't matter, since that exactly matches the SVE
1326 destructive merging form. For ternary operations we could either
1327 pick the first operand and use FMAD-like instructions or the last
1328 operand and use FMLA-like instructions; the latter seems more
1329 natural. */
1330
1331 static tree
1332 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1333 {
1334 return nops == 3 ? ops[2] : ops[0];
1335 }
1336
1337 /* Implement TARGET_HARD_REGNO_NREGS. */
1338
1339 static unsigned int
1340 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1341 {
1342 /* ??? Logically we should only need to provide a value when
1343 HARD_REGNO_MODE_OK says that the combination is valid,
1344 but at the moment we need to handle all modes. Just ignore
1345 any runtime parts for registers that can't store them. */
1346 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1347 switch (aarch64_regno_regclass (regno))
1348 {
1349 case FP_REGS:
1350 case FP_LO_REGS:
1351 if (aarch64_sve_data_mode_p (mode))
1352 return exact_div (GET_MODE_SIZE (mode),
1353 BYTES_PER_SVE_VECTOR).to_constant ();
1354 return CEIL (lowest_size, UNITS_PER_VREG);
1355 case PR_REGS:
1356 case PR_LO_REGS:
1357 case PR_HI_REGS:
1358 return 1;
1359 default:
1360 return CEIL (lowest_size, UNITS_PER_WORD);
1361 }
1362 gcc_unreachable ();
1363 }
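/* Worked example (assuming the usual UNITS_PER_VREG == 16 and
   UNITS_PER_WORD == 8): a 16-byte V4SImode value needs
   CEIL (16, 16) == 1 FP/SIMD register but CEIL (16, 8) == 2 GP
   registers, while an SVE data mode always needs
   GET_MODE_SIZE / BYTES_PER_SVE_VECTOR registers (1 for a single
   vector, 2-4 for the structure modes).  */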
1364
1365 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1366
1367 static bool
1368 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1369 {
1370 if (GET_MODE_CLASS (mode) == MODE_CC)
1371 return regno == CC_REGNUM;
1372
1373 if (regno == VG_REGNUM)
1374 /* This must have the same size as _Unwind_Word. */
1375 return mode == DImode;
1376
1377 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1378 if (vec_flags & VEC_SVE_PRED)
1379 return PR_REGNUM_P (regno);
1380
1381 if (PR_REGNUM_P (regno))
1382 return 0;
1383
1384 if (regno == SP_REGNUM)
1385 /* The purpose of comparing with ptr_mode is to support the
1386 global register variable associated with the stack pointer
1387 register via the syntax of asm ("wsp") in ILP32. */
1388 return mode == Pmode || mode == ptr_mode;
1389
1390 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1391 return mode == Pmode;
1392
1393 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1394 return true;
1395
1396 if (FP_REGNUM_P (regno))
1397 {
1398 if (vec_flags & VEC_STRUCT)
1399 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1400 else
1401 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1402 }
1403
1404 return false;
1405 }
1406
1407 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1408 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1409 clobbers the top 64 bits when restoring the bottom 64 bits. */
1410
1411 static bool
1412 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1413 {
1414 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1415 }
1416
1417 /* Implement REGMODE_NATURAL_SIZE. */
1418 poly_uint64
1419 aarch64_regmode_natural_size (machine_mode mode)
1420 {
1421 /* The natural size for SVE data modes is one SVE data vector,
1422 and similarly for predicates. We can't independently modify
1423 anything smaller than that. */
1424 /* ??? For now, only do this for variable-width SVE registers.
1425 Doing it for constant-sized registers breaks lower-subreg.c. */
1426 /* ??? And once that's fixed, we should probably have similar
1427 code for Advanced SIMD. */
1428 if (!aarch64_sve_vg.is_constant ())
1429 {
1430 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1431 if (vec_flags & VEC_SVE_PRED)
1432 return BYTES_PER_SVE_PRED;
1433 if (vec_flags & VEC_SVE_DATA)
1434 return BYTES_PER_SVE_VECTOR;
1435 }
1436 return UNITS_PER_WORD;
1437 }
1438
1439 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1440 machine_mode
1441 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1442 machine_mode mode)
1443 {
1444 /* The predicate mode determines which bits are significant and
1445 which are "don't care". Decreasing the number of lanes would
1446 lose data while increasing the number of lanes would make bits
1447 unnecessarily significant. */
1448 if (PR_REGNUM_P (regno))
1449 return mode;
1450 if (known_ge (GET_MODE_SIZE (mode), 4))
1451 return mode;
1452 else
1453 return SImode;
1454 }
1455
1456 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1457 that strcpy from constants will be faster. */
1458
1459 static HOST_WIDE_INT
1460 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1461 {
1462 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1463 return MAX (align, BITS_PER_WORD);
1464 return align;
1465 }
1466
1467 /* Return true if calls to DECL should be treated as
1468 long-calls (i.e. called via a register). */
1469 static bool
1470 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1471 {
1472 return false;
1473 }
1474
1475 /* Return true if calls to symbol-ref SYM should be treated as
1476 long-calls (i.e. called via a register). */
1477 bool
1478 aarch64_is_long_call_p (rtx sym)
1479 {
1480 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1481 }
1482
1483 /* Return true if calls to symbol-ref SYM should not go through
1484 plt stubs. */
1485
1486 bool
1487 aarch64_is_noplt_call_p (rtx sym)
1488 {
1489 const_tree decl = SYMBOL_REF_DECL (sym);
1490
1491 if (flag_pic
1492 && decl
1493 && (!flag_plt
1494 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1495 && !targetm.binds_local_p (decl))
1496 return true;
1497
1498 return false;
1499 }
1500
1501 /* Return true if the offsets to a zero/sign-extract operation
1502 represent an expression that matches an extend operation. The
1503 operands represent the parameters from
1504
1505 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1506 bool
1507 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1508 rtx extract_imm)
1509 {
1510 HOST_WIDE_INT mult_val, extract_val;
1511
1512 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1513 return false;
1514
1515 mult_val = INTVAL (mult_imm);
1516 extract_val = INTVAL (extract_imm);
1517
1518 if (extract_val > 8
1519 && extract_val < GET_MODE_BITSIZE (mode)
1520 && exact_log2 (extract_val & ~7) > 0
1521 && (extract_val & 7) <= 4
1522 && mult_val == (1 << (extract_val & 7)))
1523 return true;
1524
1525 return false;
1526 }
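/* Worked example: extract_val == 34 and mult_val == 4 pass every test
   above (34 > 8, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4 and
   4 == 1 << 2), corresponding to a 32-bit value extended and shifted
   left by two bits, as in an add/sub with an extended register operand.  */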
1527
1528 /* Emit an insn that's a simple single-set. Both the operands must be
1529 known to be valid. */
1530 inline static rtx_insn *
1531 emit_set_insn (rtx x, rtx y)
1532 {
1533 return emit_insn (gen_rtx_SET (x, y));
1534 }
1535
1536 /* X and Y are two things to compare using CODE. Emit the compare insn and
1537 return the rtx for register 0 in the proper mode. */
1538 rtx
1539 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1540 {
1541 machine_mode mode = SELECT_CC_MODE (code, x, y);
1542 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1543
1544 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1545 return cc_reg;
1546 }
1547
1548 /* Build the SYMBOL_REF for __tls_get_addr. */
1549
1550 static GTY(()) rtx tls_get_addr_libfunc;
1551
1552 rtx
1553 aarch64_tls_get_addr (void)
1554 {
1555 if (!tls_get_addr_libfunc)
1556 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1557 return tls_get_addr_libfunc;
1558 }
1559
1560 /* Return the TLS model to use for ADDR. */
1561
1562 static enum tls_model
1563 tls_symbolic_operand_type (rtx addr)
1564 {
1565 enum tls_model tls_kind = TLS_MODEL_NONE;
1566 if (GET_CODE (addr) == CONST)
1567 {
1568 poly_int64 addend;
1569 rtx sym = strip_offset (addr, &addend);
1570 if (GET_CODE (sym) == SYMBOL_REF)
1571 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1572 }
1573 else if (GET_CODE (addr) == SYMBOL_REF)
1574 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1575
1576 return tls_kind;
1577 }
1578
1579 /* We allow lo_sum's in our legitimate addresses so that combine
1580 can take care of combining addresses where necessary, but for
1581 generation purposes we generate the address
1582 as:
1583 RTL Absolute
1584 tmp = hi (symbol_ref); adrp x1, foo
1585 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1586 nop
1587
1588 PIC TLS
1589 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1590 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1591 bl __tls_get_addr
1592 nop
1593
1594 Load TLS symbol, depending on TLS mechanism and TLS access model.
1595
1596 Global Dynamic - Traditional TLS:
1597 adrp tmp, :tlsgd:imm
1598 add dest, tmp, #:tlsgd_lo12:imm
1599 bl __tls_get_addr
1600
1601 Global Dynamic - TLS Descriptors:
1602 adrp dest, :tlsdesc:imm
1603 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1604 add dest, dest, #:tlsdesc_lo12:imm
1605 blr tmp
1606 mrs tp, tpidr_el0
1607 add dest, dest, tp
1608
1609 Initial Exec:
1610 mrs tp, tpidr_el0
1611 adrp tmp, :gottprel:imm
1612 ldr dest, [tmp, #:gottprel_lo12:imm]
1613 add dest, dest, tp
1614
1615 Local Exec:
1616 mrs tp, tpidr_el0
1617 add t0, tp, #:tprel_hi12:imm, lsl #12
1618 add t0, t0, #:tprel_lo12_nc:imm
1619 */
1620
1621 static void
1622 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1623 enum aarch64_symbol_type type)
1624 {
1625 switch (type)
1626 {
1627 case SYMBOL_SMALL_ABSOLUTE:
1628 {
1629 /* In ILP32, the mode of dest can be either SImode or DImode. */
1630 rtx tmp_reg = dest;
1631 machine_mode mode = GET_MODE (dest);
1632
1633 gcc_assert (mode == Pmode || mode == ptr_mode);
1634
1635 if (can_create_pseudo_p ())
1636 tmp_reg = gen_reg_rtx (mode);
1637
1638 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1639 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1640 return;
1641 }
1642
1643 case SYMBOL_TINY_ABSOLUTE:
1644 emit_insn (gen_rtx_SET (dest, imm));
1645 return;
1646
1647 case SYMBOL_SMALL_GOT_28K:
1648 {
1649 machine_mode mode = GET_MODE (dest);
1650 rtx gp_rtx = pic_offset_table_rtx;
1651 rtx insn;
1652 rtx mem;
1653
1654 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1655 here before rtl expansion. Tree IVOPTS will generate rtl patterns to
1656 decide rtx costs, in which case pic_offset_table_rtx is not
1657 initialized. In that case there is no need to generate the first
1658 adrp instruction, as the final cost for global variable access is
1659 one instruction. */
1660 if (gp_rtx != NULL)
1661 {
1662 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1663 use the page base as the GOT base, the first page may be wasted;
1664 in the worst case there is only 28K of space for the GOT).
1665
1666 The generated instruction sequence for accessing a global variable
1667 is:
1668
1669 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1670
1671 Only one instruction is needed. But we must initialize
1672 pic_offset_table_rtx properly. We generate an initialization insn
1673 for every global access, and allow CSE to remove all redundant ones.
1674
1675 The final instruction sequence will look like the following
1676 for multiple global variable accesses.
1677
1678 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1679
1680 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1681 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1682 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1683 ... */
1684
1685 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1686 crtl->uses_pic_offset_table = 1;
1687 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1688
1689 if (mode != GET_MODE (gp_rtx))
1690 gp_rtx = gen_lowpart (mode, gp_rtx);
1691
1692 }
1693
1694 if (mode == ptr_mode)
1695 {
1696 if (mode == DImode)
1697 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1698 else
1699 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1700
1701 mem = XVECEXP (SET_SRC (insn), 0, 0);
1702 }
1703 else
1704 {
1705 gcc_assert (mode == Pmode);
1706
1707 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1708 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1709 }
1710
1711 /* The operand is expected to be a MEM. Whenever the related insn
1712 pattern changes, the above code which calculates mem should be
1713 updated. */
1714 gcc_assert (GET_CODE (mem) == MEM);
1715 MEM_READONLY_P (mem) = 1;
1716 MEM_NOTRAP_P (mem) = 1;
1717 emit_insn (insn);
1718 return;
1719 }
1720
1721 case SYMBOL_SMALL_GOT_4G:
1722 {
1723 /* In ILP32, the mode of dest can be either SImode or DImode,
1724 while the got entry is always of SImode size. The mode of
1725 dest depends on how dest is used: if dest is assigned to a
1726 pointer (e.g. in the memory), it has SImode; it may have
1727 DImode if dest is dereferenced to access the memory.
1728 This is why we have to handle three different ldr_got_small
1729 patterns here (two patterns for ILP32). */
1730
1731 rtx insn;
1732 rtx mem;
1733 rtx tmp_reg = dest;
1734 machine_mode mode = GET_MODE (dest);
1735
1736 if (can_create_pseudo_p ())
1737 tmp_reg = gen_reg_rtx (mode);
1738
1739 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1740 if (mode == ptr_mode)
1741 {
1742 if (mode == DImode)
1743 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1744 else
1745 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1746
1747 mem = XVECEXP (SET_SRC (insn), 0, 0);
1748 }
1749 else
1750 {
1751 gcc_assert (mode == Pmode);
1752
1753 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1754 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1755 }
1756
1757 gcc_assert (GET_CODE (mem) == MEM);
1758 MEM_READONLY_P (mem) = 1;
1759 MEM_NOTRAP_P (mem) = 1;
1760 emit_insn (insn);
1761 return;
1762 }
1763
1764 case SYMBOL_SMALL_TLSGD:
1765 {
1766 rtx_insn *insns;
1767 machine_mode mode = GET_MODE (dest);
1768 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1769
1770 start_sequence ();
1771 if (TARGET_ILP32)
1772 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1773 else
1774 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1775 insns = get_insns ();
1776 end_sequence ();
1777
1778 RTL_CONST_CALL_P (insns) = 1;
1779 emit_libcall_block (insns, dest, result, imm);
1780 return;
1781 }
1782
1783 case SYMBOL_SMALL_TLSDESC:
1784 {
1785 machine_mode mode = GET_MODE (dest);
1786 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1787 rtx tp;
1788
1789 gcc_assert (mode == Pmode || mode == ptr_mode);
1790
1791 /* In ILP32, the got entry is always of SImode size. Unlike
1792 small GOT, the dest is fixed at reg 0. */
1793 if (TARGET_ILP32)
1794 emit_insn (gen_tlsdesc_small_si (imm));
1795 else
1796 emit_insn (gen_tlsdesc_small_di (imm));
1797 tp = aarch64_load_tp (NULL);
1798
1799 if (mode != Pmode)
1800 tp = gen_lowpart (mode, tp);
1801
1802 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1803 if (REG_P (dest))
1804 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1805 return;
1806 }
1807
1808 case SYMBOL_SMALL_TLSIE:
1809 {
1810 /* In ILP32, the mode of dest can be either SImode or DImode,
1811 while the got entry is always of SImode size. The mode of
1812 dest depends on how dest is used: if dest is assigned to a
1813 pointer (e.g. in the memory), it has SImode; it may have
1814 DImode if dest is dereferenced to access the memory.
1815 This is why we have to handle three different tlsie_small
1816 patterns here (two patterns for ILP32). */
1817 machine_mode mode = GET_MODE (dest);
1818 rtx tmp_reg = gen_reg_rtx (mode);
1819 rtx tp = aarch64_load_tp (NULL);
1820
1821 if (mode == ptr_mode)
1822 {
1823 if (mode == DImode)
1824 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1825 else
1826 {
1827 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1828 tp = gen_lowpart (mode, tp);
1829 }
1830 }
1831 else
1832 {
1833 gcc_assert (mode == Pmode);
1834 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1835 }
1836
1837 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1838 if (REG_P (dest))
1839 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1840 return;
1841 }
1842
1843 case SYMBOL_TLSLE12:
1844 case SYMBOL_TLSLE24:
1845 case SYMBOL_TLSLE32:
1846 case SYMBOL_TLSLE48:
1847 {
1848 machine_mode mode = GET_MODE (dest);
1849 rtx tp = aarch64_load_tp (NULL);
1850
1851 if (mode != Pmode)
1852 tp = gen_lowpart (mode, tp);
1853
1854 switch (type)
1855 {
1856 case SYMBOL_TLSLE12:
1857 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1858 (dest, tp, imm));
1859 break;
1860 case SYMBOL_TLSLE24:
1861 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1862 (dest, tp, imm));
1863 break;
1864 case SYMBOL_TLSLE32:
1865 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1866 (dest, imm));
1867 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1868 (dest, dest, tp));
1869 break;
1870 case SYMBOL_TLSLE48:
1871 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1872 (dest, imm));
1873 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1874 (dest, dest, tp));
1875 break;
1876 default:
1877 gcc_unreachable ();
1878 }
1879
1880 if (REG_P (dest))
1881 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1882 return;
1883 }
1884
1885 case SYMBOL_TINY_GOT:
1886 emit_insn (gen_ldr_got_tiny (dest, imm));
1887 return;
1888
1889 case SYMBOL_TINY_TLSIE:
1890 {
1891 machine_mode mode = GET_MODE (dest);
1892 rtx tp = aarch64_load_tp (NULL);
1893
1894 if (mode == ptr_mode)
1895 {
1896 if (mode == DImode)
1897 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1898 else
1899 {
1900 tp = gen_lowpart (mode, tp);
1901 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1902 }
1903 }
1904 else
1905 {
1906 gcc_assert (mode == Pmode);
1907 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1908 }
1909
1910 if (REG_P (dest))
1911 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1912 return;
1913 }
1914
1915 default:
1916 gcc_unreachable ();
1917 }
1918 }
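
/* For reference (an illustrative sketch; the exact relocation syntax is
   defined by the corresponding patterns in aarch64.md): the
   SYMBOL_SMALL_GOT_4G case above ultimately expands to a sequence of
   the form

     adrp x0, :got:sym
     ldr  x0, [x0, #:got_lo12:sym]

   i.e. the page address of the GOT slot followed by a load of the slot
   itself, which the code marks as read-only and non-trapping.  */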
1919
1920 /* Emit a move from SRC to DEST. Assume that the move expanders can
1921 handle all moves if !can_create_pseudo_p (). The distinction is
1922 important because, unlike emit_move_insn, the move expanders know
1923 how to force Pmode objects into the constant pool even when the
1924 constant pool address is not itself legitimate. */
1925 static rtx
1926 aarch64_emit_move (rtx dest, rtx src)
1927 {
1928 return (can_create_pseudo_p ()
1929 ? emit_move_insn (dest, src)
1930 : emit_move_insn_1 (dest, src));
1931 }
1932
1933 /* Apply UNOPTAB to OP and store the result in DEST. */
1934
1935 static void
1936 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1937 {
1938 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1939 if (dest != tmp)
1940 emit_move_insn (dest, tmp);
1941 }
1942
1943 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1944
1945 static void
1946 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1947 {
1948 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1949 OPTAB_DIRECT);
1950 if (dest != tmp)
1951 emit_move_insn (dest, tmp);
1952 }
1953
1954 /* Split a 128-bit move operation into two 64-bit move operations,
1955 taking care to handle partial overlap of register to register
1956 copies. Special cases are needed when moving between GP regs and
1957 FP regs. SRC can be a register, constant or memory; DST a register
1958 or memory. If either operand is memory it must not have any side
1959 effects. */
1960 void
1961 aarch64_split_128bit_move (rtx dst, rtx src)
1962 {
1963 rtx dst_lo, dst_hi;
1964 rtx src_lo, src_hi;
1965
1966 machine_mode mode = GET_MODE (dst);
1967
1968 gcc_assert (mode == TImode || mode == TFmode);
1969 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1970 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1971
1972 if (REG_P (dst) && REG_P (src))
1973 {
1974 int src_regno = REGNO (src);
1975 int dst_regno = REGNO (dst);
1976
1977 /* Handle FP <-> GP regs. */
1978 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1979 {
1980 src_lo = gen_lowpart (word_mode, src);
1981 src_hi = gen_highpart (word_mode, src);
1982
1983 if (mode == TImode)
1984 {
1985 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1986 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1987 }
1988 else
1989 {
1990 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1991 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1992 }
1993 return;
1994 }
1995 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1996 {
1997 dst_lo = gen_lowpart (word_mode, dst);
1998 dst_hi = gen_highpart (word_mode, dst);
1999
2000 if (mode == TImode)
2001 {
2002 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
2003 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
2004 }
2005 else
2006 {
2007 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
2008 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
2009 }
2010 return;
2011 }
2012 }
2013
2014 dst_lo = gen_lowpart (word_mode, dst);
2015 dst_hi = gen_highpart (word_mode, dst);
2016 src_lo = gen_lowpart (word_mode, src);
2017 src_hi = gen_highpart_mode (word_mode, mode, src);
2018
2019 /* At most one pairing may overlap. */
2020 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2021 {
2022 aarch64_emit_move (dst_hi, src_hi);
2023 aarch64_emit_move (dst_lo, src_lo);
2024 }
2025 else
2026 {
2027 aarch64_emit_move (dst_lo, src_lo);
2028 aarch64_emit_move (dst_hi, src_hi);
2029 }
2030 }
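
/* As a rough illustration of the overlap handling above (assuming a
   little-endian target, with the low half of the 128-bit value in the
   lower-numbered register): copying x1:x2 into x2:x3 must move the
   high halves first,

     mov x3, x2
     mov x2, x1

   because the low destination register x2 overlaps the high source
   register x2; a non-overlapping copy is emitted low half first.  */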
2031
2032 bool
2033 aarch64_split_128bit_move_p (rtx dst, rtx src)
2034 {
2035 return (! REG_P (src)
2036 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2037 }
2038
2039 /* Split a complex SIMD combine. */
2040
2041 void
2042 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2043 {
2044 machine_mode src_mode = GET_MODE (src1);
2045 machine_mode dst_mode = GET_MODE (dst);
2046
2047 gcc_assert (VECTOR_MODE_P (dst_mode));
2048 gcc_assert (register_operand (dst, dst_mode)
2049 && register_operand (src1, src_mode)
2050 && register_operand (src2, src_mode));
2051
2052 rtx (*gen) (rtx, rtx, rtx);
2053
2054 switch (src_mode)
2055 {
2056 case E_V8QImode:
2057 gen = gen_aarch64_simd_combinev8qi;
2058 break;
2059 case E_V4HImode:
2060 gen = gen_aarch64_simd_combinev4hi;
2061 break;
2062 case E_V2SImode:
2063 gen = gen_aarch64_simd_combinev2si;
2064 break;
2065 case E_V4HFmode:
2066 gen = gen_aarch64_simd_combinev4hf;
2067 break;
2068 case E_V2SFmode:
2069 gen = gen_aarch64_simd_combinev2sf;
2070 break;
2071 case E_DImode:
2072 gen = gen_aarch64_simd_combinedi;
2073 break;
2074 case E_DFmode:
2075 gen = gen_aarch64_simd_combinedf;
2076 break;
2077 default:
2078 gcc_unreachable ();
2079 }
2080
2081 emit_insn (gen (dst, src1, src2));
2082 return;
2083 }
2084
2085 /* Split a complex SIMD move. */
2086
2087 void
2088 aarch64_split_simd_move (rtx dst, rtx src)
2089 {
2090 machine_mode src_mode = GET_MODE (src);
2091 machine_mode dst_mode = GET_MODE (dst);
2092
2093 gcc_assert (VECTOR_MODE_P (dst_mode));
2094
2095 if (REG_P (dst) && REG_P (src))
2096 {
2097 rtx (*gen) (rtx, rtx);
2098
2099 gcc_assert (VECTOR_MODE_P (src_mode));
2100
2101 switch (src_mode)
2102 {
2103 case E_V16QImode:
2104 gen = gen_aarch64_split_simd_movv16qi;
2105 break;
2106 case E_V8HImode:
2107 gen = gen_aarch64_split_simd_movv8hi;
2108 break;
2109 case E_V4SImode:
2110 gen = gen_aarch64_split_simd_movv4si;
2111 break;
2112 case E_V2DImode:
2113 gen = gen_aarch64_split_simd_movv2di;
2114 break;
2115 case E_V8HFmode:
2116 gen = gen_aarch64_split_simd_movv8hf;
2117 break;
2118 case E_V4SFmode:
2119 gen = gen_aarch64_split_simd_movv4sf;
2120 break;
2121 case E_V2DFmode:
2122 gen = gen_aarch64_split_simd_movv2df;
2123 break;
2124 default:
2125 gcc_unreachable ();
2126 }
2127
2128 emit_insn (gen (dst, src));
2129 return;
2130 }
2131 }
2132
2133 bool
2134 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2135 machine_mode ymode, rtx y)
2136 {
2137 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2138 gcc_assert (r != NULL);
2139 return rtx_equal_p (x, r);
2140 }
2141
2142
2143 static rtx
2144 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2145 {
2146 if (can_create_pseudo_p ())
2147 return force_reg (mode, value);
2148 else
2149 {
2150 gcc_assert (x);
2151 aarch64_emit_move (x, value);
2152 return x;
2153 }
2154 }
2155
2156 /* Return true if we can move VALUE into a register using a single
2157 CNT[BHWD] instruction. */
2158
2159 static bool
2160 aarch64_sve_cnt_immediate_p (poly_int64 value)
2161 {
2162 HOST_WIDE_INT factor = value.coeffs[0];
2163 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2164 return (value.coeffs[1] == factor
2165 && IN_RANGE (factor, 2, 16 * 16)
2166 && (factor & 1) == 0
2167 && factor <= 16 * (factor & -factor));
2168 }
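
/* For example (an illustrative sketch of the condition above): the
   values 2 + 2x, 4 + 4x, 8 + 8x and 16 + 16x, where x is the number of
   128-bit blocks beyond the first, correspond to plain CNTD, CNTW,
   CNTH and CNTB; 6 + 6x can be emitted as "cntd ..., all, mul #3" and
   32 + 32x as "cntb ..., all, mul #2".  A value such as 34 + 34x is
   rejected, since it would need a multiplier of 17 on CNTD.  */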
2169
2170 /* Likewise for rtx X. */
2171
2172 bool
2173 aarch64_sve_cnt_immediate_p (rtx x)
2174 {
2175 poly_int64 value;
2176 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2177 }
2178
2179 /* Return the asm string for an instruction with a CNT-like vector size
2180 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2181 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2182 first part of the operands template (the part that comes before the
2183 vector size itself). FACTOR is the number of quadwords.
2184 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2185 If it is zero, we can use any element size. */
2186
2187 static char *
2188 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2189 unsigned int factor,
2190 unsigned int nelts_per_vq)
2191 {
2192 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2193
2194 if (nelts_per_vq == 0)
2195 /* There is some overlap in the ranges of the four CNT instructions.
2196 Here we always use the smallest possible element size, so that the
2197 multiplier is 1 wherever possible.  */
2198 nelts_per_vq = factor & -factor;
2199 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2200 gcc_assert (IN_RANGE (shift, 1, 4));
2201 char suffix = "dwhb"[shift - 1];
2202
2203 factor >>= shift;
2204 unsigned int written;
2205 if (factor == 1)
2206 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2207 prefix, suffix, operands);
2208 else
2209 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2210 prefix, suffix, operands, factor);
2211 gcc_assert (written < sizeof (buffer));
2212 return buffer;
2213 }
2214
2215 /* Return the asm string for an instruction with a CNT-like vector size
2216 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2217 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2218 first part of the operands template (the part that comes before the
2219 vector size itself). X is the value of the vector size operand,
2220 as a polynomial integer rtx. */
2221
2222 char *
2223 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2224 rtx x)
2225 {
2226 poly_int64 value = rtx_to_poly_int64 (x);
2227 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2228 return aarch64_output_sve_cnt_immediate (prefix, operands,
2229 value.coeffs[1], 0);
2230 }
2231
2232 /* Return true if we can add VALUE to a register using a single ADDVL
2233 or ADDPL instruction. */
2234
2235 static bool
2236 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2237 {
2238 HOST_WIDE_INT factor = value.coeffs[0];
2239 if (factor == 0 || value.coeffs[1] != factor)
2240 return false;
2241 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2242 and a value of 16 is one vector width. */
2243 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2244 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2245 }
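
/* As an illustration of the ranges above: an offset of 16 + 16x bytes
   is one vector length and maps to "addvl ..., #1", an offset of
   2 + 2x bytes is one predicate length and maps to "addpl ..., #1",
   and 32 + 32x maps to "addvl ..., #2".  Offsets outside [-32, 31]
   vector lengths (or [-32, 31] predicate lengths for ADDPL) are
   rejected here and handled by the more general code elsewhere.  */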
2246
2247 /* Likewise for rtx X. */
2248
2249 bool
2250 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2251 {
2252 poly_int64 value;
2253 return (poly_int_rtx_p (x, &value)
2254 && aarch64_sve_addvl_addpl_immediate_p (value));
2255 }
2256
2257 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2258 and storing the result in operand 0. */
2259
2260 char *
2261 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2262 {
2263 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2264 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2265 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2266
2267 /* Use INC or DEC if possible. */
2268 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2269 {
2270 if (aarch64_sve_cnt_immediate_p (offset_value))
2271 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2272 offset_value.coeffs[1], 0);
2273 if (aarch64_sve_cnt_immediate_p (-offset_value))
2274 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2275 -offset_value.coeffs[1], 0);
2276 }
2277
2278 int factor = offset_value.coeffs[1];
2279 if ((factor & 15) == 0)
2280 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2281 else
2282 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2283 return buffer;
2284 }
2285
2286 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2287 instruction. If it is, store the number of elements in each vector
2288 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2289 factor in *FACTOR_OUT (if nonnull). */
2290
2291 bool
2292 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2293 unsigned int *nelts_per_vq_out)
2294 {
2295 rtx elt;
2296 poly_int64 value;
2297
2298 if (!const_vec_duplicate_p (x, &elt)
2299 || !poly_int_rtx_p (elt, &value))
2300 return false;
2301
2302 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2303 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2304 /* There's no vector INCB. */
2305 return false;
2306
2307 HOST_WIDE_INT factor = value.coeffs[0];
2308 if (value.coeffs[1] != factor)
2309 return false;
2310
2311 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2312 if ((factor % nelts_per_vq) != 0
2313 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2314 return false;
2315
2316 if (factor_out)
2317 *factor_out = factor;
2318 if (nelts_per_vq_out)
2319 *nelts_per_vq_out = nelts_per_vq;
2320 return true;
2321 }
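
/* For example (an illustrative sketch of the tests above): a VNx4SI
   constant in which every element is 4 + 4x (the number of 32-bit
   elements in the vector) passes the checks with FACTOR 4 and
   NELTS_PER_VQ 4 and can be emitted as "incw"; the negated constant
   corresponds to "decw".  A duplicated value of 3 + 3x is rejected
   because it is not a multiple of the element count per quadword.  */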
2322
2323 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2324 instruction. */
2325
2326 bool
2327 aarch64_sve_inc_dec_immediate_p (rtx x)
2328 {
2329 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2330 }
2331
2332 /* Return the asm template for an SVE vector INC or DEC instruction.
2333 OPERANDS gives the operands before the vector count and X is the
2334 value of the vector count operand itself. */
2335
2336 char *
2337 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2338 {
2339 int factor;
2340 unsigned int nelts_per_vq;
2341 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2342 gcc_unreachable ();
2343 if (factor < 0)
2344 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2345 nelts_per_vq);
2346 else
2347 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2348 nelts_per_vq);
2349 }
2350
2351 static int
2352 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2353 scalar_int_mode mode)
2354 {
2355 int i;
2356 unsigned HOST_WIDE_INT val, val2, mask;
2357 int one_match, zero_match;
2358 int num_insns;
2359
2360 val = INTVAL (imm);
2361
2362 if (aarch64_move_imm (val, mode))
2363 {
2364 if (generate)
2365 emit_insn (gen_rtx_SET (dest, imm));
2366 return 1;
2367 }
2368
2369 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2370 (with XXXX non-zero). In that case check to see if the move can be done in
2371 a smaller mode. */
2372 val2 = val & 0xffffffff;
2373 if (mode == DImode
2374 && aarch64_move_imm (val2, SImode)
2375 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2376 {
2377 if (generate)
2378 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2379
2380 /* Check if we have to emit a second instruction by checking to see
2381 if any of the upper 32 bits of the original DI mode value is set. */
2382 if (val == val2)
2383 return 1;
2384
2385 i = (val >> 48) ? 48 : 32;
2386
2387 if (generate)
2388 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2389 GEN_INT ((val >> i) & 0xffff)));
2390
2391 return 2;
2392 }
2393
2394 if ((val >> 32) == 0 || mode == SImode)
2395 {
2396 if (generate)
2397 {
2398 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2399 if (mode == SImode)
2400 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2401 GEN_INT ((val >> 16) & 0xffff)));
2402 else
2403 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2404 GEN_INT ((val >> 16) & 0xffff)));
2405 }
2406 return 2;
2407 }
2408
2409 /* Remaining cases are all for DImode. */
2410
2411 mask = 0xffff;
2412 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2413 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2414 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2415 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2416
2417 if (zero_match != 2 && one_match != 2)
2418 {
2419 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2420 For a 64-bit bitmask try whether changing 16 bits to all ones or
2421 zeroes creates a valid bitmask. To check any repeated bitmask,
2422 try using 16 bits from the other 32-bit half of val. */
2423
2424 for (i = 0; i < 64; i += 16, mask <<= 16)
2425 {
2426 val2 = val & ~mask;
2427 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2428 break;
2429 val2 = val | mask;
2430 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2431 break;
2432 val2 = val2 & ~mask;
2433 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2434 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2435 break;
2436 }
2437 if (i != 64)
2438 {
2439 if (generate)
2440 {
2441 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2442 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2443 GEN_INT ((val >> i) & 0xffff)));
2444 }
2445 return 2;
2446 }
2447 }
2448
2449 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2450 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2451 otherwise skip zero bits. */
2452
2453 num_insns = 1;
2454 mask = 0xffff;
2455 val2 = one_match > zero_match ? ~val : val;
2456 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2457
2458 if (generate)
2459 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2460 ? (val | ~(mask << i))
2461 : (val & (mask << i)))));
2462 for (i += 16; i < 64; i += 16)
2463 {
2464 if ((val2 & (mask << i)) == 0)
2465 continue;
2466 if (generate)
2467 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2468 GEN_INT ((val >> i) & 0xffff)));
2469 num_insns ++;
2470 }
2471
2472 return num_insns;
2473 }
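
/* As a rough illustration of the sequences chosen above (the exact
   output depends on which of the earlier special cases match): a value
   such as 0x12345678 typically becomes

     mov  x0, 0x5678
     movk x0, 0x1234, lsl 16

   while a value with four unrelated non-zero, non-0xffff halfwords
   that is not close to a bitmask immediate needs the full
   mov + three-movk sequence, the four-instruction worst case.  */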
2474
2475 /* Return whether imm is a 128-bit immediate which is simple enough to
2476 expand inline. */
2477 bool
2478 aarch64_mov128_immediate (rtx imm)
2479 {
2480 if (GET_CODE (imm) == CONST_INT)
2481 return true;
2482
2483 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2484
2485 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2486 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2487
2488 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2489 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2490 }
2491
2492
2493 /* Return the number of temporary registers that aarch64_add_offset_1
2494 would need to add OFFSET to a register. */
2495
2496 static unsigned int
2497 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2498 {
2499 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2500 }
2501
2502 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2503 a non-polynomial OFFSET. MODE is the mode of the addition.
2504 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2505 be set and CFA adjustments added to the generated instructions.
2506
2507 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2508 temporary if register allocation is already complete. This temporary
2509 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2510 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2511 the immediate again.
2512
2513 Since this function may be used to adjust the stack pointer, we must
2514 ensure that it cannot cause transient stack deallocation (for example
2515 by first incrementing SP and then decrementing when adjusting by a
2516 large immediate). */
2517
2518 static void
2519 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2520 rtx src, HOST_WIDE_INT offset, rtx temp1,
2521 bool frame_related_p, bool emit_move_imm)
2522 {
2523 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2524 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2525
2526 HOST_WIDE_INT moffset = abs_hwi (offset);
2527 rtx_insn *insn;
2528
2529 if (!moffset)
2530 {
2531 if (!rtx_equal_p (dest, src))
2532 {
2533 insn = emit_insn (gen_rtx_SET (dest, src));
2534 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2535 }
2536 return;
2537 }
2538
2539 /* Single instruction adjustment. */
2540 if (aarch64_uimm12_shift (moffset))
2541 {
2542 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2543 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2544 return;
2545 }
2546
2547 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2548 and either:
2549
2550 a) the offset cannot be loaded by a 16-bit move or
2551 b) there is no spare register into which we can move it. */
2552 if (moffset < 0x1000000
2553 && ((!temp1 && !can_create_pseudo_p ())
2554 || !aarch64_move_imm (moffset, mode)))
2555 {
2556 HOST_WIDE_INT low_off = moffset & 0xfff;
2557
2558 low_off = offset < 0 ? -low_off : low_off;
2559 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2560 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2561 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2562 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2563 return;
2564 }
2565
2566 /* Emit a move immediate if required and an addition/subtraction. */
2567 if (emit_move_imm)
2568 {
2569 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2570 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2571 }
2572 insn = emit_insn (offset < 0
2573 ? gen_sub3_insn (dest, src, temp1)
2574 : gen_add3_insn (dest, src, temp1));
2575 if (frame_related_p)
2576 {
2577 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2578 rtx adj = plus_constant (mode, src, offset);
2579 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2580 }
2581 }
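
/* For example (an illustrative sketch): adding 0x123456 to SP is split
   by the code above into two immediate additions,

     add sp, sp, #0x456
     add sp, sp, #0x123000

   since both parts fit the 12-bit (optionally shifted by 12) ADD
   immediate range, whereas a larger or less regular offset falls back
   to a move-immediate into a temporary followed by a register ADD or
   SUB.  */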
2582
2583 /* Return the number of temporary registers that aarch64_add_offset
2584 would need to move OFFSET into a register or add OFFSET to a register;
2585 ADD_P is true if we want the latter rather than the former. */
2586
2587 static unsigned int
2588 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2589 {
2590 /* This follows the same structure as aarch64_add_offset. */
2591 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2592 return 0;
2593
2594 unsigned int count = 0;
2595 HOST_WIDE_INT factor = offset.coeffs[1];
2596 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2597 poly_int64 poly_offset (factor, factor);
2598 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2599 /* Need one register for the ADDVL/ADDPL result. */
2600 count += 1;
2601 else if (factor != 0)
2602 {
2603 factor = abs (factor);
2604 if (factor > 16 * (factor & -factor))
2605 /* Need one register for the CNT result and one for the multiplication
2606 factor. If necessary, the second temporary can be reused for the
2607 constant part of the offset. */
2608 return 2;
2609 /* Need one register for the CNT result (which might then
2610 be shifted). */
2611 count += 1;
2612 }
2613 return count + aarch64_add_offset_1_temporaries (constant);
2614 }
2615
2616 /* If X can be represented as a poly_int64, return the number
2617 of temporaries that are required to add it to a register.
2618 Return -1 otherwise. */
2619
2620 int
2621 aarch64_add_offset_temporaries (rtx x)
2622 {
2623 poly_int64 offset;
2624 if (!poly_int_rtx_p (x, &offset))
2625 return -1;
2626 return aarch64_offset_temporaries (true, offset);
2627 }
2628
2629 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2630 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2631 be set and CFA adjustments added to the generated instructions.
2632
2633 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2634 temporary if register allocation is already complete. This temporary
2635 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2636 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2637 false to avoid emitting the immediate again.
2638
2639 TEMP2, if nonnull, is a second temporary register that doesn't
2640 overlap either DEST or SRC.
2641
2642 Since this function may be used to adjust the stack pointer, we must
2643 ensure that it cannot cause transient stack deallocation (for example
2644 by first incrementing SP and then decrementing when adjusting by a
2645 large immediate). */
2646
2647 static void
2648 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2649 poly_int64 offset, rtx temp1, rtx temp2,
2650 bool frame_related_p, bool emit_move_imm = true)
2651 {
2652 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2653 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2654 gcc_assert (temp1 == NULL_RTX
2655 || !frame_related_p
2656 || !reg_overlap_mentioned_p (temp1, dest));
2657 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2658
2659 /* Try using ADDVL or ADDPL to add the whole value. */
2660 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2661 {
2662 rtx offset_rtx = gen_int_mode (offset, mode);
2663 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2664 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2665 return;
2666 }
2667
2668 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2669 SVE vector register, over and above the minimum size of 128 bits.
2670 This is equivalent to half the value returned by CNTD with a
2671 vector shape of ALL. */
2672 HOST_WIDE_INT factor = offset.coeffs[1];
2673 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2674
2675 /* Try using ADDVL or ADDPL to add the VG-based part. */
2676 poly_int64 poly_offset (factor, factor);
2677 if (src != const0_rtx
2678 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2679 {
2680 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2681 if (frame_related_p)
2682 {
2683 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2684 RTX_FRAME_RELATED_P (insn) = true;
2685 src = dest;
2686 }
2687 else
2688 {
2689 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2690 src = aarch64_force_temporary (mode, temp1, addr);
2691 temp1 = temp2;
2692 temp2 = NULL_RTX;
2693 }
2694 }
2695 /* Otherwise use a CNT-based sequence. */
2696 else if (factor != 0)
2697 {
2698 /* Use a subtraction if we have a negative factor. */
2699 rtx_code code = PLUS;
2700 if (factor < 0)
2701 {
2702 factor = -factor;
2703 code = MINUS;
2704 }
2705
2706 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2707 into the multiplication. */
2708 rtx val;
2709 int shift = 0;
2710 if (factor & 1)
2711 /* Use a right shift by 1. */
2712 shift = -1;
2713 else
2714 factor /= 2;
2715 HOST_WIDE_INT low_bit = factor & -factor;
2716 if (factor <= 16 * low_bit)
2717 {
2718 if (factor > 16 * 8)
2719 {
2720 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2721 the value with the minimum multiplier and shift it into
2722 position. */
2723 int extra_shift = exact_log2 (low_bit);
2724 shift += extra_shift;
2725 factor >>= extra_shift;
2726 }
2727 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2728 }
2729 else
2730 {
2731 /* Use CNTD, then multiply it by FACTOR. */
2732 val = gen_int_mode (poly_int64 (2, 2), mode);
2733 val = aarch64_force_temporary (mode, temp1, val);
2734
2735 /* Go back to using a negative multiplication factor if we have
2736 no register from which to subtract. */
2737 if (code == MINUS && src == const0_rtx)
2738 {
2739 factor = -factor;
2740 code = PLUS;
2741 }
2742 rtx coeff1 = gen_int_mode (factor, mode);
2743 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2744 val = gen_rtx_MULT (mode, val, coeff1);
2745 }
2746
2747 if (shift > 0)
2748 {
2749 /* Multiply by 1 << SHIFT. */
2750 val = aarch64_force_temporary (mode, temp1, val);
2751 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2752 }
2753 else if (shift == -1)
2754 {
2755 /* Divide by 2. */
2756 val = aarch64_force_temporary (mode, temp1, val);
2757 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2758 }
2759
2760 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2761 if (src != const0_rtx)
2762 {
2763 val = aarch64_force_temporary (mode, temp1, val);
2764 val = gen_rtx_fmt_ee (code, mode, src, val);
2765 }
2766 else if (code == MINUS)
2767 {
2768 val = aarch64_force_temporary (mode, temp1, val);
2769 val = gen_rtx_NEG (mode, val);
2770 }
2771
2772 if (constant == 0 || frame_related_p)
2773 {
2774 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2775 if (frame_related_p)
2776 {
2777 RTX_FRAME_RELATED_P (insn) = true;
2778 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2779 gen_rtx_SET (dest, plus_constant (Pmode, src,
2780 poly_offset)));
2781 }
2782 src = dest;
2783 if (constant == 0)
2784 return;
2785 }
2786 else
2787 {
2788 src = aarch64_force_temporary (mode, temp1, val);
2789 temp1 = temp2;
2790 temp2 = NULL_RTX;
2791 }
2792
2793 emit_move_imm = true;
2794 }
2795
2796 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2797 frame_related_p, emit_move_imm);
2798 }
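
/* As a rough illustration of the decomposition above (register choices
   and exact instructions are hypothetical): adding the poly_int64
   offset (48, 32) -- that is, 16 bytes plus two vector lengths -- to a
   register could be emitted as

     addvl x0, x1, #2
     add   x0, x0, #16

   using the ADDVL path for the VG-based part and aarch64_add_offset_1
   for the remaining constant, while a VG multiple that is out of
   ADDVL/ADDPL range instead goes through the CNT-based sequence using
   a temporary register.  */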
2799
2800 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2801 than a poly_int64. */
2802
2803 void
2804 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2805 rtx offset_rtx, rtx temp1, rtx temp2)
2806 {
2807 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2808 temp1, temp2, false);
2809 }
2810
2811 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2812 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2813 if TEMP1 already contains abs (DELTA). */
2814
2815 static inline void
2816 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2817 {
2818 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2819 temp1, temp2, true, emit_move_imm);
2820 }
2821
2822 /* Subtract DELTA from the stack pointer, marking the instructions
2823 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2824 if nonnull. */
2825
2826 static inline void
2827 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2828 {
2829 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2830 temp1, temp2, frame_related_p);
2831 }
2832
2833 /* Set DEST to (vec_series BASE STEP). */
2834
2835 static void
2836 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2837 {
2838 machine_mode mode = GET_MODE (dest);
2839 scalar_mode inner = GET_MODE_INNER (mode);
2840
2841 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2842 if (!aarch64_sve_index_immediate_p (base))
2843 base = force_reg (inner, base);
2844 if (!aarch64_sve_index_immediate_p (step))
2845 step = force_reg (inner, step);
2846
2847 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2848 }
2849
2850 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2851 integer of mode SRC_MODE.  Return true on success.  */
2852
2853 static bool
2854 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2855 rtx src)
2856 {
2857 /* If the constant is smaller than 128 bits, we can do the move
2858 using a vector of SRC_MODEs. */
2859 if (src_mode != TImode)
2860 {
2861 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2862 GET_MODE_SIZE (src_mode));
2863 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2864 emit_move_insn (gen_lowpart (dup_mode, dest),
2865 gen_const_vec_duplicate (dup_mode, src));
2866 return true;
2867 }
2868
2869 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2870 src = force_const_mem (src_mode, src);
2871 if (!src)
2872 return false;
2873
2874 /* Make sure that the address is legitimate. */
2875 if (!aarch64_sve_ld1r_operand_p (src))
2876 {
2877 rtx addr = force_reg (Pmode, XEXP (src, 0));
2878 src = replace_equiv_address (src, addr);
2879 }
2880
2881 machine_mode mode = GET_MODE (dest);
2882 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2883 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2884 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2885 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2886 emit_insn (gen_rtx_SET (dest, src));
2887 return true;
2888 }
2889
2890 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2891 isn't a simple duplicate or series. */
2892
2893 static void
2894 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2895 {
2896 machine_mode mode = GET_MODE (src);
2897 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2898 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2899 gcc_assert (npatterns > 1);
2900
2901 if (nelts_per_pattern == 1)
2902 {
2903 /* The constant is a repeating sequence of at least two elements,
2904 where the repeating elements occupy no more than 128 bits.
2905 Get an integer representation of the replicated value. */
2906 scalar_int_mode int_mode;
2907 if (BYTES_BIG_ENDIAN)
2908 /* For now, always use LD1RQ to load the value on big-endian
2909 targets, since the handling of smaller integers includes a
2910 subreg that is semantically an element reverse. */
2911 int_mode = TImode;
2912 else
2913 {
2914 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2915 gcc_assert (int_bits <= 128);
2916 int_mode = int_mode_for_size (int_bits, 0).require ();
2917 }
2918 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2919 if (int_value
2920 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2921 return;
2922 }
2923
2924 /* Expand each pattern individually. */
2925 rtx_vector_builder builder;
2926 auto_vec<rtx, 16> vectors (npatterns);
2927 for (unsigned int i = 0; i < npatterns; ++i)
2928 {
2929 builder.new_vector (mode, 1, nelts_per_pattern);
2930 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2931 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2932 vectors.quick_push (force_reg (mode, builder.build ()));
2933 }
2934
2935 /* Use permutes to interleave the separate vectors. */
2936 while (npatterns > 1)
2937 {
2938 npatterns /= 2;
2939 for (unsigned int i = 0; i < npatterns; ++i)
2940 {
2941 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2942 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2943 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2944 vectors[i] = tmp;
2945 }
2946 }
2947 gcc_assert (vectors[0] == dest);
2948 }
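
/* For example (a hypothetical constant, to illustrate the permute step
   above): a VNx4SI constant { 0, 100, 1, 101, 2, 102, ... } has two
   interleaved patterns, { 0, 1, 2, ... } and { 100, 101, 102, ... }.
   Each pattern is built separately (here as a linear series with step
   1) and the two vectors are then interleaved into the destination
   with a single ZIP1.  */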
2949
2950 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2951 is a pattern that can be used to set DEST to a replicated scalar
2952 element. */
2953
2954 void
2955 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2956 rtx (*gen_vec_duplicate) (rtx, rtx))
2957 {
2958 machine_mode mode = GET_MODE (dest);
2959
2960 /* Check on what type of symbol it is. */
2961 scalar_int_mode int_mode;
2962 if ((GET_CODE (imm) == SYMBOL_REF
2963 || GET_CODE (imm) == LABEL_REF
2964 || GET_CODE (imm) == CONST
2965 || GET_CODE (imm) == CONST_POLY_INT)
2966 && is_a <scalar_int_mode> (mode, &int_mode))
2967 {
2968 rtx mem;
2969 poly_int64 offset;
2970 HOST_WIDE_INT const_offset;
2971 enum aarch64_symbol_type sty;
2972
2973 /* If we have (const (plus symbol offset)), separate out the offset
2974 before we start classifying the symbol. */
2975 rtx base = strip_offset (imm, &offset);
2976
2977 /* We must always add an offset involving VL separately, rather than
2978 folding it into the relocation. */
2979 if (!offset.is_constant (&const_offset))
2980 {
2981 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2982 emit_insn (gen_rtx_SET (dest, imm));
2983 else
2984 {
2985 /* Do arithmetic on 32-bit values if the result is smaller
2986 than that. */
2987 if (partial_subreg_p (int_mode, SImode))
2988 {
2989 /* It is invalid to do symbol calculations in modes
2990 narrower than SImode. */
2991 gcc_assert (base == const0_rtx);
2992 dest = gen_lowpart (SImode, dest);
2993 int_mode = SImode;
2994 }
2995 if (base != const0_rtx)
2996 {
2997 base = aarch64_force_temporary (int_mode, dest, base);
2998 aarch64_add_offset (int_mode, dest, base, offset,
2999 NULL_RTX, NULL_RTX, false);
3000 }
3001 else
3002 aarch64_add_offset (int_mode, dest, base, offset,
3003 dest, NULL_RTX, false);
3004 }
3005 return;
3006 }
3007
3008 sty = aarch64_classify_symbol (base, const_offset);
3009 switch (sty)
3010 {
3011 case SYMBOL_FORCE_TO_MEM:
3012 if (const_offset != 0
3013 && targetm.cannot_force_const_mem (int_mode, imm))
3014 {
3015 gcc_assert (can_create_pseudo_p ());
3016 base = aarch64_force_temporary (int_mode, dest, base);
3017 aarch64_add_offset (int_mode, dest, base, const_offset,
3018 NULL_RTX, NULL_RTX, false);
3019 return;
3020 }
3021
3022 mem = force_const_mem (ptr_mode, imm);
3023 gcc_assert (mem);
3024
3025 /* If we aren't generating PC relative literals, then
3026 we need to expand the literal pool access carefully.
3027 This is something that needs to be done in a number
3028 of places, so could well live as a separate function. */
3029 if (!aarch64_pcrelative_literal_loads)
3030 {
3031 gcc_assert (can_create_pseudo_p ());
3032 base = gen_reg_rtx (ptr_mode);
3033 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3034 if (ptr_mode != Pmode)
3035 base = convert_memory_address (Pmode, base);
3036 mem = gen_rtx_MEM (ptr_mode, base);
3037 }
3038
3039 if (int_mode != ptr_mode)
3040 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3041
3042 emit_insn (gen_rtx_SET (dest, mem));
3043
3044 return;
3045
3046 case SYMBOL_SMALL_TLSGD:
3047 case SYMBOL_SMALL_TLSDESC:
3048 case SYMBOL_SMALL_TLSIE:
3049 case SYMBOL_SMALL_GOT_28K:
3050 case SYMBOL_SMALL_GOT_4G:
3051 case SYMBOL_TINY_GOT:
3052 case SYMBOL_TINY_TLSIE:
3053 if (const_offset != 0)
3054 {
3055 gcc_assert (can_create_pseudo_p ());
3056 base = aarch64_force_temporary (int_mode, dest, base);
3057 aarch64_add_offset (int_mode, dest, base, const_offset,
3058 NULL_RTX, NULL_RTX, false);
3059 return;
3060 }
3061 /* FALLTHRU */
3062
3063 case SYMBOL_SMALL_ABSOLUTE:
3064 case SYMBOL_TINY_ABSOLUTE:
3065 case SYMBOL_TLSLE12:
3066 case SYMBOL_TLSLE24:
3067 case SYMBOL_TLSLE32:
3068 case SYMBOL_TLSLE48:
3069 aarch64_load_symref_appropriately (dest, imm, sty);
3070 return;
3071
3072 default:
3073 gcc_unreachable ();
3074 }
3075 }
3076
3077 if (!CONST_INT_P (imm))
3078 {
3079 rtx base, step, value;
3080 if (GET_CODE (imm) == HIGH
3081 || aarch64_simd_valid_immediate (imm, NULL))
3082 emit_insn (gen_rtx_SET (dest, imm));
3083 else if (const_vec_series_p (imm, &base, &step))
3084 aarch64_expand_vec_series (dest, base, step);
3085 else if (const_vec_duplicate_p (imm, &value))
3086 {
3087 /* If the constant is out of range of an SVE vector move,
3088 load it from memory if we can, otherwise move it into
3089 a register and use a DUP. */
3090 scalar_mode inner_mode = GET_MODE_INNER (mode);
3091 rtx op = force_const_mem (inner_mode, value);
3092 if (!op)
3093 op = force_reg (inner_mode, value);
3094 else if (!aarch64_sve_ld1r_operand_p (op))
3095 {
3096 rtx addr = force_reg (Pmode, XEXP (op, 0));
3097 op = replace_equiv_address (op, addr);
3098 }
3099 emit_insn (gen_vec_duplicate (dest, op));
3100 }
3101 else if (GET_CODE (imm) == CONST_VECTOR
3102 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3103 aarch64_expand_sve_const_vector (dest, imm);
3104 else
3105 {
3106 rtx mem = force_const_mem (mode, imm);
3107 gcc_assert (mem);
3108 emit_move_insn (dest, mem);
3109 }
3110
3111 return;
3112 }
3113
3114 aarch64_internal_mov_immediate (dest, imm, true,
3115 as_a <scalar_int_mode> (mode));
3116 }
3117
3118 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3119 that is known to contain PTRUE. */
3120
3121 void
3122 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3123 {
3124 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3125 gen_rtvec (2, pred, src),
3126 UNSPEC_MERGE_PTRUE)));
3127 }
3128
3129 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3130 operand is in memory. In this case we need to use the predicated LD1
3131 and ST1 instead of LDR and STR, both for correctness on big-endian
3132 targets and because LD1 and ST1 support a wider range of addressing modes.
3133 PRED_MODE is the mode of the predicate.
3134
3135 See the comment at the head of aarch64-sve.md for details about the
3136 big-endian handling. */
3137
3138 void
3139 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3140 {
3141 machine_mode mode = GET_MODE (dest);
3142 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3143 if (!register_operand (src, mode)
3144 && !register_operand (dest, mode))
3145 {
3146 rtx tmp = gen_reg_rtx (mode);
3147 if (MEM_P (src))
3148 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3149 else
3150 emit_move_insn (tmp, src);
3151 src = tmp;
3152 }
3153 aarch64_emit_sve_pred_move (dest, ptrue, src);
3154 }
3155
3156 /* Called only on big-endian targets. See whether an SVE vector move
3157 from SRC to DEST is effectively a REV[BHW] instruction, because at
3158 least one operand is a subreg of an SVE vector that has wider or
3159 narrower elements. Return true and emit the instruction if so.
3160
3161 For example:
3162
3163 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3164
3165 represents a VIEW_CONVERT between the following vectors, viewed
3166 in memory order:
3167
3168 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3169 R1: { [0], [1], [2], [3], ... }
3170
3171 The high part of lane X in R2 should therefore correspond to lane X*2
3172 of R1, but the register representations are:
3173
3174 msb lsb
3175 R2: ...... [1].high [1].low [0].high [0].low
3176 R1: ...... [3] [2] [1] [0]
3177
3178 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3179 We therefore need a reverse operation to swap the high and low values
3180 around.
3181
3182 This is purely an optimization. Without it we would spill the
3183 subreg operand to the stack in one mode and reload it in the
3184 other mode, which has the same effect as the REV. */
3185
3186 bool
3187 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3188 {
3189 gcc_assert (BYTES_BIG_ENDIAN);
3190 if (GET_CODE (dest) == SUBREG)
3191 dest = SUBREG_REG (dest);
3192 if (GET_CODE (src) == SUBREG)
3193 src = SUBREG_REG (src);
3194
3195 /* The optimization handles two single SVE REGs with different element
3196 sizes. */
3197 if (!REG_P (dest)
3198 || !REG_P (src)
3199 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3200 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3201 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3202 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3203 return false;
3204
3205 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3206 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3207 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3208 UNSPEC_REV_SUBREG);
3209 emit_insn (gen_rtx_SET (dest, unspec));
3210 return true;
3211 }
3212
3213 /* Return a copy of X with mode MODE, without changing its other
3214 attributes. Unlike gen_lowpart, this doesn't care whether the
3215 mode change is valid. */
3216
3217 static rtx
3218 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3219 {
3220 if (GET_MODE (x) == mode)
3221 return x;
3222
3223 x = shallow_copy_rtx (x);
3224 set_mode_and_regno (x, mode, REGNO (x));
3225 return x;
3226 }
3227
3228 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3229 operands. */
3230
3231 void
3232 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3233 {
3234 /* Decide which REV operation we need. The mode with narrower elements
3235 determines the mode of the operands and the mode with the wider
3236 elements determines the reverse width. */
3237 machine_mode mode_with_wider_elts = GET_MODE (dest);
3238 machine_mode mode_with_narrower_elts = GET_MODE (src);
3239 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3240 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3241 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3242
3243 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3244 unsigned int unspec;
3245 if (wider_bytes == 8)
3246 unspec = UNSPEC_REV64;
3247 else if (wider_bytes == 4)
3248 unspec = UNSPEC_REV32;
3249 else if (wider_bytes == 2)
3250 unspec = UNSPEC_REV16;
3251 else
3252 gcc_unreachable ();
3253 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3254
3255 /* Emit:
3256
3257 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3258 UNSPEC_MERGE_PTRUE))
3259
3260 with the appropriate modes. */
3261 ptrue = gen_lowpart (pred_mode, ptrue);
3262 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3263 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3264 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3265 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3266 UNSPEC_MERGE_PTRUE);
3267 emit_insn (gen_rtx_SET (dest, src));
3268 }
3269
3270 static bool
3271 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3272 tree exp ATTRIBUTE_UNUSED)
3273 {
3274 /* Currently, always true. */
3275 return true;
3276 }
3277
3278 /* Implement TARGET_PASS_BY_REFERENCE. */
3279
3280 static bool
3281 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3282 machine_mode mode,
3283 const_tree type,
3284 bool named ATTRIBUTE_UNUSED)
3285 {
3286 HOST_WIDE_INT size;
3287 machine_mode dummymode;
3288 int nregs;
3289
3290 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3291 if (mode == BLKmode && type)
3292 size = int_size_in_bytes (type);
3293 else
3294 /* No frontends can create types with variable-sized modes, so we
3295 shouldn't be asked to pass or return them. */
3296 size = GET_MODE_SIZE (mode).to_constant ();
3297
3298 /* Aggregates are passed by reference based on their size. */
3299 if (type && AGGREGATE_TYPE_P (type))
3300 {
3301 size = int_size_in_bytes (type);
3302 }
3303
3304 /* Variable sized arguments are always returned by reference. */
3305 if (size < 0)
3306 return true;
3307
3308 /* Can this be a candidate to be passed in fp/simd register(s)? */
3309 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3310 &dummymode, &nregs,
3311 NULL))
3312 return false;
3313
3314 /* Arguments that are variable-sized or larger than 2 registers are
3315 passed by reference unless they are a homogeneous floating-point
3316 aggregate.  */
3317 return size > 2 * UNITS_PER_WORD;
3318 }
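
/* For example (illustrative only): a 24-byte structure of three
   64-bit integers is larger than two registers and not a candidate
   for the SIMD/FP registers, so it is passed by reference, whereas a
   32-byte homogeneous aggregate of four doubles is such a candidate
   and is passed by value in v0-v3.  */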
3319
3320 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3321 static bool
3322 aarch64_return_in_msb (const_tree valtype)
3323 {
3324 machine_mode dummy_mode;
3325 int dummy_int;
3326
3327 /* Never happens in little-endian mode. */
3328 if (!BYTES_BIG_ENDIAN)
3329 return false;
3330
3331 /* Only composite types smaller than or equal to 16 bytes can
3332 be potentially returned in registers. */
3333 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3334 || int_size_in_bytes (valtype) <= 0
3335 || int_size_in_bytes (valtype) > 16)
3336 return false;
3337
3338 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3339 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3340 is always passed/returned in the least significant bits of fp/simd
3341 register(s). */
3342 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3343 &dummy_mode, &dummy_int, NULL))
3344 return false;
3345
3346 return true;
3347 }
3348
3349 /* Implement TARGET_FUNCTION_VALUE.
3350 Define how to find the value returned by a function. */
3351
3352 static rtx
3353 aarch64_function_value (const_tree type, const_tree func,
3354 bool outgoing ATTRIBUTE_UNUSED)
3355 {
3356 machine_mode mode;
3357 int unsignedp;
3358 int count;
3359 machine_mode ag_mode;
3360
3361 mode = TYPE_MODE (type);
3362 if (INTEGRAL_TYPE_P (type))
3363 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3364
3365 if (aarch64_return_in_msb (type))
3366 {
3367 HOST_WIDE_INT size = int_size_in_bytes (type);
3368
3369 if (size % UNITS_PER_WORD != 0)
3370 {
3371 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3372 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3373 }
3374 }
3375
3376 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3377 &ag_mode, &count, NULL))
3378 {
3379 if (!aarch64_composite_type_p (type, mode))
3380 {
3381 gcc_assert (count == 1 && mode == ag_mode);
3382 return gen_rtx_REG (mode, V0_REGNUM);
3383 }
3384 else
3385 {
3386 int i;
3387 rtx par;
3388
3389 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3390 for (i = 0; i < count; i++)
3391 {
3392 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3393 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3394 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3395 XVECEXP (par, 0, i) = tmp;
3396 }
3397 return par;
3398 }
3399 }
3400 else
3401 return gen_rtx_REG (mode, R0_REGNUM);
3402 }
3403
3404 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3405 Return true if REGNO is the number of a hard register in which the values
3406 of called function may come back. */
3407
3408 static bool
3409 aarch64_function_value_regno_p (const unsigned int regno)
3410 {
3411 /* A maximum of 16 bytes can be returned in the general registers.  Examples
3412 of 16-byte return values are: 128-bit integers and 16-byte small
3413 structures (excluding homogeneous floating-point aggregates). */
3414 if (regno == R0_REGNUM || regno == R1_REGNUM)
3415 return true;
3416
3417 /* Up to four fp/simd registers can return a function value, e.g. a
3418 homogeneous floating-point aggregate having four members. */
3419 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3420 return TARGET_FLOAT;
3421
3422 return false;
3423 }
3424
3425 /* Implement TARGET_RETURN_IN_MEMORY.
3426
3427 If the type T of the result of a function is such that
3428 void func (T arg)
3429 would require that arg be passed as a value in a register (or set of
3430 registers) according to the parameter passing rules, then the result
3431 is returned in the same registers as would be used for such an
3432 argument. */
3433
3434 static bool
3435 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3436 {
3437 HOST_WIDE_INT size;
3438 machine_mode ag_mode;
3439 int count;
3440
3441 if (!AGGREGATE_TYPE_P (type)
3442 && TREE_CODE (type) != COMPLEX_TYPE
3443 && TREE_CODE (type) != VECTOR_TYPE)
3444 /* Simple scalar types are always returned in registers.  */
3445 return false;
3446
3447 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3448 type,
3449 &ag_mode,
3450 &count,
3451 NULL))
3452 return false;
3453
3454 /* Types larger than 2 registers are returned in memory.  */
3455 size = int_size_in_bytes (type);
3456 return (size < 0 || size > 2 * UNITS_PER_WORD);
3457 }
3458
3459 static bool
3460 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3461 const_tree type, int *nregs)
3462 {
3463 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3464 return aarch64_vfp_is_call_or_return_candidate (mode,
3465 type,
3466 &pcum->aapcs_vfp_rmode,
3467 nregs,
3468 NULL);
3469 }
3470
3471 /* Given MODE and TYPE of a function argument, return the alignment in
3472 bits. The idea is to suppress any stronger alignment requested by
3473 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3474 This is a helper function for local use only. */
3475
3476 static unsigned int
3477 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3478 {
3479 if (!type)
3480 return GET_MODE_ALIGNMENT (mode);
3481
3482 if (integer_zerop (TYPE_SIZE (type)))
3483 return 0;
3484
3485 gcc_assert (TYPE_MODE (type) == mode);
3486
3487 if (!AGGREGATE_TYPE_P (type))
3488 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3489
3490 if (TREE_CODE (type) == ARRAY_TYPE)
3491 return TYPE_ALIGN (TREE_TYPE (type));
3492
3493 unsigned int alignment = 0;
3494 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3495 if (TREE_CODE (field) == FIELD_DECL)
3496 alignment = std::max (alignment, DECL_ALIGN (field));
3497
3498 return alignment;
3499 }
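
/* For instance (an illustrative example of the rule above): for a
   hypothetical struct containing an int and a double, the alignment
   used for argument layout is that of its most-aligned field, i.e.
   64 bits, rather than any larger alignment that may have been
   requested on the struct type itself.  */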
3500
3501 /* Layout a function argument according to the AAPCS64 rules. The rule
3502 numbers refer to the rule numbers in the AAPCS64. */
3503
3504 static void
3505 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3506 const_tree type,
3507 bool named ATTRIBUTE_UNUSED)
3508 {
3509 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3510 int ncrn, nvrn, nregs;
3511 bool allocate_ncrn, allocate_nvrn;
3512 HOST_WIDE_INT size;
3513
3514 /* We need to do this once per argument. */
3515 if (pcum->aapcs_arg_processed)
3516 return;
3517
3518 pcum->aapcs_arg_processed = true;
3519
3520 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3521 if (type)
3522 size = int_size_in_bytes (type);
3523 else
3524 /* No frontends can create types with variable-sized modes, so we
3525 shouldn't be asked to pass or return them. */
3526 size = GET_MODE_SIZE (mode).to_constant ();
3527 size = ROUND_UP (size, UNITS_PER_WORD);
3528
3529 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3530 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3531 mode,
3532 type,
3533 &nregs);
3534
3535 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3536 The following code thus handles passing by SIMD/FP registers first. */
3537
3538 nvrn = pcum->aapcs_nvrn;
3539
3540 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3541 and homogeneous short-vector aggregates (HVA).  */
3542 if (allocate_nvrn)
3543 {
3544 if (!TARGET_FLOAT)
3545 aarch64_err_no_fpadvsimd (mode);
3546
3547 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3548 {
3549 pcum->aapcs_nextnvrn = nvrn + nregs;
3550 if (!aarch64_composite_type_p (type, mode))
3551 {
3552 gcc_assert (nregs == 1);
3553 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3554 }
3555 else
3556 {
3557 rtx par;
3558 int i;
3559 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3560 for (i = 0; i < nregs; i++)
3561 {
3562 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3563 V0_REGNUM + nvrn + i);
3564 rtx offset = gen_int_mode
3565 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3566 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3567 XVECEXP (par, 0, i) = tmp;
3568 }
3569 pcum->aapcs_reg = par;
3570 }
3571 return;
3572 }
3573 else
3574 {
3575 /* C.3 NSRN is set to 8. */
3576 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3577 goto on_stack;
3578 }
3579 }
3580
3581 ncrn = pcum->aapcs_ncrn;
3582 nregs = size / UNITS_PER_WORD;
3583
3584 /* C6 - C9, though the sign- and zero-extension semantics are
3585 handled elsewhere.  This is the case where the argument fits
3586 entirely in general registers.  */
3587 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3588 {
3589
3590 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3591
3592 /* C.8: if the argument has an alignment of 16 bytes, then the NGRN is
3593 rounded up to the next even number.  */
3594 if (nregs == 2
3595 && ncrn % 2
3596 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3597 comparison is there because for alignment > 16 * BITS_PER_UNIT,
3598 nregs should be > 2, and the argument should therefore be
3599 passed by reference rather than by value.  */
3600 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3601 {
3602 ++ncrn;
3603 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3604 }
3605
3606 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3607 A reg is still generated for it, but the caller should be smart
3608 enough not to use it. */
3609 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3610 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3611 else
3612 {
3613 rtx par;
3614 int i;
3615
3616 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3617 for (i = 0; i < nregs; i++)
3618 {
3619 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3620 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3621 GEN_INT (i * UNITS_PER_WORD));
3622 XVECEXP (par, 0, i) = tmp;
3623 }
3624 pcum->aapcs_reg = par;
3625 }
3626
3627 pcum->aapcs_nextncrn = ncrn + nregs;
3628 return;
3629 }
3630
3631 /* C.11 */
3632 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3633
3634   /* The argument is passed on the stack; record the needed number of words for
3635 this argument and align the total size if necessary. */
3636 on_stack:
3637 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3638
3639 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3640 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3641 16 / UNITS_PER_WORD);
3642 return;
3643 }
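
/* Illustrative sketch (not part of the build): the general-register
   bookkeeping above, reduced to plain arithmetic.  The name
   ngrn_example and its parameters are hypothetical; 8-byte words and
   8 integer argument registers are assumed, as on AArch64.  */
#if 0
static int
ngrn_example (int ncrn, long size_in_bytes, int align_is_16_bytes)
{
  /* Round the argument size up to a whole number of 8-byte words.  */
  long nregs = (size_in_bytes + 7) / 8;

  /* C.8: a 16-byte-aligned two-word argument starts at an even NGRN.  */
  if (nregs == 2 && align_is_16_bytes && (ncrn & 1))
    ncrn++;

  /* The argument then occupies NREGS registers starting at NCRN,
     provided ncrn + nregs <= 8; otherwise it goes to the stack.  */
  return ncrn;
}
#endif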
3644
3645 /* Implement TARGET_FUNCTION_ARG. */
3646
3647 static rtx
3648 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3649 const_tree type, bool named)
3650 {
3651 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3652 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3653
3654 if (mode == VOIDmode)
3655 return NULL_RTX;
3656
3657 aarch64_layout_arg (pcum_v, mode, type, named);
3658 return pcum->aapcs_reg;
3659 }
3660
3661 void
3662 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3663 const_tree fntype ATTRIBUTE_UNUSED,
3664 rtx libname ATTRIBUTE_UNUSED,
3665 const_tree fndecl ATTRIBUTE_UNUSED,
3666 unsigned n_named ATTRIBUTE_UNUSED)
3667 {
3668 pcum->aapcs_ncrn = 0;
3669 pcum->aapcs_nvrn = 0;
3670 pcum->aapcs_nextncrn = 0;
3671 pcum->aapcs_nextnvrn = 0;
3672 pcum->pcs_variant = ARM_PCS_AAPCS64;
3673 pcum->aapcs_reg = NULL_RTX;
3674 pcum->aapcs_arg_processed = false;
3675 pcum->aapcs_stack_words = 0;
3676 pcum->aapcs_stack_size = 0;
3677
3678 if (!TARGET_FLOAT
3679 && fndecl && TREE_PUBLIC (fndecl)
3680 && fntype && fntype != error_mark_node)
3681 {
3682 const_tree type = TREE_TYPE (fntype);
3683 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3684 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3685 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3686 &mode, &nregs, NULL))
3687 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3688 }
3689 return;
3690 }
3691
3692 static void
3693 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3694 machine_mode mode,
3695 const_tree type,
3696 bool named)
3697 {
3698 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3699 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3700 {
3701 aarch64_layout_arg (pcum_v, mode, type, named);
3702 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3703 != (pcum->aapcs_stack_words != 0));
3704 pcum->aapcs_arg_processed = false;
3705 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3706 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3707 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3708 pcum->aapcs_stack_words = 0;
3709 pcum->aapcs_reg = NULL_RTX;
3710 }
3711 }
3712
3713 bool
3714 aarch64_function_arg_regno_p (unsigned regno)
3715 {
3716 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3717 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3718 }
3719
3720 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3721 PARM_BOUNDARY bits of alignment, but will be given anything up
3722 to STACK_BOUNDARY bits if the type requires it. This makes sure
3723 that both before and after the layout of each argument, the Next
3724 Stacked Argument Address (NSAA) will have a minimum alignment of
3725 8 bytes. */
3726
3727 static unsigned int
3728 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3729 {
3730 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3731 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3732 }
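
/* Illustrative sketch (not part of the build) of the clamping done by
   aarch64_function_arg_boundary, using the usual AArch64 values of
   PARM_BOUNDARY (64 bits) and STACK_BOUNDARY (128 bits); the function
   name is hypothetical.  An 8-bit type is promoted to 64 bits of
   alignment, a 128-bit vector keeps 128, and anything larger is capped
   at 128.  */
#if 0
static unsigned int
arg_boundary_example (unsigned int type_alignment_in_bits)
{
  unsigned int lower = 64;	/* PARM_BOUNDARY on AArch64.  */
  unsigned int upper = 128;	/* STACK_BOUNDARY on AArch64.  */
  unsigned int a = type_alignment_in_bits;
  if (a < lower)
    a = lower;
  if (a > upper)
    a = upper;
  return a;			/* MIN (MAX (alignment, 64), 128).  */
}
#endif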
3733
3734 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3735
3736 static fixed_size_mode
3737 aarch64_get_reg_raw_mode (int regno)
3738 {
3739 if (TARGET_SVE && FP_REGNUM_P (regno))
3740 /* Don't use the SVE part of the register for __builtin_apply and
3741 __builtin_return. The SVE registers aren't used by the normal PCS,
3742 so using them there would be a waste of time. The PCS extensions
3743 for SVE types are fundamentally incompatible with the
3744 __builtin_return/__builtin_apply interface. */
3745 return as_a <fixed_size_mode> (V16QImode);
3746 return default_get_reg_raw_mode (regno);
3747 }
3748
3749 /* Implement TARGET_FUNCTION_ARG_PADDING.
3750
3751 Small aggregate types are placed in the lowest memory address.
3752
3753 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3754
3755 static pad_direction
3756 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3757 {
3758 /* On little-endian targets, the least significant byte of every stack
3759 argument is passed at the lowest byte address of the stack slot. */
3760 if (!BYTES_BIG_ENDIAN)
3761 return PAD_UPWARD;
3762
3763 /* Otherwise, integral, floating-point and pointer types are padded downward:
3764 the least significant byte of a stack argument is passed at the highest
3765 byte address of the stack slot. */
3766 if (type
3767 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3768 || POINTER_TYPE_P (type))
3769 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3770 return PAD_DOWNWARD;
3771
3772 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3773 return PAD_UPWARD;
3774 }
3775
3776 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3777
3778    It specifies the padding for the last (and possibly the only)
3779    element of a block move between registers and memory.  Viewing
3780    the block as it sits in memory, upward padding means that the
3781    last element is padded after its most significant byte, while
3782    with downward padding the last element is padded on its least
3783    significant byte side.
3784
3785 Small aggregates and small complex types are always padded
3786 upwards.
3787
3788 We don't need to worry about homogeneous floating-point or
3789 short-vector aggregates; their move is not affected by the
3790 padding direction determined here. Regardless of endianness,
3791 each element of such an aggregate is put in the least
3792 significant bits of a fp/simd register.
3793
3794 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3795 register has useful data, and return the opposite if the most
3796 significant byte does. */
3797
3798 bool
3799 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3800 bool first ATTRIBUTE_UNUSED)
3801 {
3802
3803 /* Small composite types are always padded upward. */
3804 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3805 {
3806 HOST_WIDE_INT size;
3807 if (type)
3808 size = int_size_in_bytes (type);
3809 else
3810 /* No frontends can create types with variable-sized modes, so we
3811 shouldn't be asked to pass or return them. */
3812 size = GET_MODE_SIZE (mode).to_constant ();
3813 if (size < 2 * UNITS_PER_WORD)
3814 return true;
3815 }
3816
3817 /* Otherwise, use the default padding. */
3818 return !BYTES_BIG_ENDIAN;
3819 }
3820
3821 static scalar_int_mode
3822 aarch64_libgcc_cmp_return_mode (void)
3823 {
3824 return SImode;
3825 }
3826
3827 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3828
3829 /* We use the 12-bit shifted immediate arithmetic instructions so values
3830    must be a multiple of (1 << 12), i.e. 4096.  */
3831 #define ARITH_FACTOR 4096
3832
3833 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3834 #error Cannot use simple address calculation for stack probing
3835 #endif
3836
3837 /* The pair of scratch registers used for stack probing. */
3838 #define PROBE_STACK_FIRST_REG 9
3839 #define PROBE_STACK_SECOND_REG 10
3840
3841 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3842 inclusive. These are offsets from the current stack pointer. */
3843
3844 static void
3845 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3846 {
3847 HOST_WIDE_INT size;
3848 if (!poly_size.is_constant (&size))
3849 {
3850 sorry ("stack probes for SVE frames");
3851 return;
3852 }
3853
3854 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3855
3856 /* See the same assertion on PROBE_INTERVAL above. */
3857 gcc_assert ((first % ARITH_FACTOR) == 0);
3858
3859 /* See if we have a constant small number of probes to generate. If so,
3860 that's the easy case. */
3861 if (size <= PROBE_INTERVAL)
3862 {
3863 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3864
3865 emit_set_insn (reg1,
3866 plus_constant (Pmode,
3867 stack_pointer_rtx, -(first + base)));
3868 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3869 }
3870
3871 /* The run-time loop is made up of 8 insns in the generic case while the
3872      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
3873 else if (size <= 4 * PROBE_INTERVAL)
3874 {
3875 HOST_WIDE_INT i, rem;
3876
3877 emit_set_insn (reg1,
3878 plus_constant (Pmode,
3879 stack_pointer_rtx,
3880 -(first + PROBE_INTERVAL)));
3881 emit_stack_probe (reg1);
3882
3883 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3884 it exceeds SIZE. If only two probes are needed, this will not
3885 generate any code. Then probe at FIRST + SIZE. */
3886 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3887 {
3888 emit_set_insn (reg1,
3889 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3890 emit_stack_probe (reg1);
3891 }
3892
3893 rem = size - (i - PROBE_INTERVAL);
3894 if (rem > 256)
3895 {
3896 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3897
3898 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3899 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3900 }
3901 else
3902 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3903 }
3904
3905 /* Otherwise, do the same as above, but in a loop. Note that we must be
3906 extra careful with variables wrapping around because we might be at
3907 the very top (or the very bottom) of the address space and we have
3908 to be able to handle this case properly; in particular, we use an
3909 equality test for the loop condition. */
3910 else
3911 {
3912 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3913
3914 /* Step 1: round SIZE to the previous multiple of the interval. */
3915
3916 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3917
3918
3919 /* Step 2: compute initial and final value of the loop counter. */
3920
3921 /* TEST_ADDR = SP + FIRST. */
3922 emit_set_insn (reg1,
3923 plus_constant (Pmode, stack_pointer_rtx, -first));
3924
3925 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3926 HOST_WIDE_INT adjustment = - (first + rounded_size);
3927 if (! aarch64_uimm12_shift (adjustment))
3928 {
3929 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3930 true, Pmode);
3931 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3932 }
3933 else
3934 emit_set_insn (reg2,
3935 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3936
3937 /* Step 3: the loop
3938
3939 do
3940 {
3941 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3942 probe at TEST_ADDR
3943 }
3944 while (TEST_ADDR != LAST_ADDR)
3945
3946 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3947 until it is equal to ROUNDED_SIZE. */
3948
3949 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3950
3951
3952 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3953 that SIZE is equal to ROUNDED_SIZE. */
3954
3955 if (size != rounded_size)
3956 {
3957 HOST_WIDE_INT rem = size - rounded_size;
3958
3959 if (rem > 256)
3960 {
3961 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3962
3963 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3964 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3965 }
3966 else
3967 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3968 }
3969 }
3970
3971 /* Make sure nothing is scheduled before we are done. */
3972 emit_insn (gen_blockage ());
3973 }
3974
3975 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3976 absolute addresses. */
3977
3978 const char *
3979 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3980 {
3981 static int labelno = 0;
3982 char loop_lab[32];
3983 rtx xops[2];
3984
3985 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3986
3987 /* Loop. */
3988 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3989
3990 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3991 xops[0] = reg1;
3992 xops[1] = GEN_INT (PROBE_INTERVAL);
3993 output_asm_insn ("sub\t%0, %0, %1", xops);
3994
3995 /* Probe at TEST_ADDR. */
3996 output_asm_insn ("str\txzr, [%0]", xops);
3997
3998 /* Test if TEST_ADDR == LAST_ADDR. */
3999 xops[1] = reg2;
4000 output_asm_insn ("cmp\t%0, %1", xops);
4001
4002 /* Branch. */
4003 fputs ("\tb.ne\t", asm_out_file);
4004 assemble_name_raw (asm_out_file, loop_lab);
4005 fputc ('\n', asm_out_file);
4006
4007 return "";
4008 }
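
/* For reference, the loop emitted by aarch64_output_probe_stack_range
   looks like the following with the default 4096-byte PROBE_INTERVAL
   (x9 and x10 correspond to PROBE_STACK_FIRST_REG and
   PROBE_STACK_SECOND_REG above):

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
   */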
4009
4010 /* Determine whether a frame chain needs to be generated. */
4011 static bool
4012 aarch64_needs_frame_chain (void)
4013 {
4014 /* Force a frame chain for EH returns so the return address is at FP+8. */
4015 if (frame_pointer_needed || crtl->calls_eh_return)
4016 return true;
4017
4018 /* A leaf function cannot have calls or write LR. */
4019 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4020
4021 /* Don't use a frame chain in leaf functions if leaf frame pointers
4022 are disabled. */
4023 if (flag_omit_leaf_frame_pointer && is_leaf)
4024 return false;
4025
4026 return aarch64_use_frame_pointer;
4027 }
4028
4029 /* Mark the registers that need to be saved by the callee and calculate
4030 the size of the callee-saved registers area and frame record (both FP
4031 and LR may be omitted). */
4032 static void
4033 aarch64_layout_frame (void)
4034 {
4035 HOST_WIDE_INT offset = 0;
4036 int regno, last_fp_reg = INVALID_REGNUM;
4037
4038 if (reload_completed && cfun->machine->frame.laid_out)
4039 return;
4040
4041 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4042
4043 #define SLOT_NOT_REQUIRED (-2)
4044 #define SLOT_REQUIRED (-1)
4045
4046 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4047 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4048
4049 /* First mark all the registers that really need to be saved... */
4050 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4051 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4052
4053 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4054 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4055
4056 /* ... that includes the eh data registers (if needed)... */
4057 if (crtl->calls_eh_return)
4058 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4059 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4060 = SLOT_REQUIRED;
4061
4062 /* ... and any callee saved register that dataflow says is live. */
4063 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4064 if (df_regs_ever_live_p (regno)
4065 && (regno == R30_REGNUM
4066 || !call_used_regs[regno]))
4067 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4068
4069 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4070 if (df_regs_ever_live_p (regno)
4071 && !call_used_regs[regno])
4072 {
4073 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4074 last_fp_reg = regno;
4075 }
4076
4077 if (cfun->machine->frame.emit_frame_chain)
4078 {
4079 /* FP and LR are placed in the linkage record. */
4080 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4081 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4082 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4083 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4084 offset = 2 * UNITS_PER_WORD;
4085 }
4086
4087 /* Now assign stack slots for them. */
4088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4090 {
4091 cfun->machine->frame.reg_offset[regno] = offset;
4092 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4093 cfun->machine->frame.wb_candidate1 = regno;
4094 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4095 cfun->machine->frame.wb_candidate2 = regno;
4096 offset += UNITS_PER_WORD;
4097 }
4098
4099 HOST_WIDE_INT max_int_offset = offset;
4100 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4101 bool has_align_gap = offset != max_int_offset;
4102
4103 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4104 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4105 {
4106 /* If there is an alignment gap between integer and fp callee-saves,
4107 allocate the last fp register to it if possible. */
4108 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4109 {
4110 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4111 break;
4112 }
4113
4114 cfun->machine->frame.reg_offset[regno] = offset;
4115 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4116 cfun->machine->frame.wb_candidate1 = regno;
4117 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4118 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4119 cfun->machine->frame.wb_candidate2 = regno;
4120 offset += UNITS_PER_WORD;
4121 }
4122
4123 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4124
4125 cfun->machine->frame.saved_regs_size = offset;
4126
4127 HOST_WIDE_INT varargs_and_saved_regs_size
4128 = offset + cfun->machine->frame.saved_varargs_size;
4129
4130 cfun->machine->frame.hard_fp_offset
4131 = aligned_upper_bound (varargs_and_saved_regs_size
4132 + get_frame_size (),
4133 STACK_BOUNDARY / BITS_PER_UNIT);
4134
4135 /* Both these values are already aligned. */
4136 gcc_assert (multiple_p (crtl->outgoing_args_size,
4137 STACK_BOUNDARY / BITS_PER_UNIT));
4138 cfun->machine->frame.frame_size
4139 = (cfun->machine->frame.hard_fp_offset
4140 + crtl->outgoing_args_size);
4141
4142 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4143
4144 cfun->machine->frame.initial_adjust = 0;
4145 cfun->machine->frame.final_adjust = 0;
4146 cfun->machine->frame.callee_adjust = 0;
4147 cfun->machine->frame.callee_offset = 0;
4148
4149 HOST_WIDE_INT max_push_offset = 0;
4150 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4151 max_push_offset = 512;
4152 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4153 max_push_offset = 256;
4154
4155 HOST_WIDE_INT const_size, const_fp_offset;
4156 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4157 && const_size < max_push_offset
4158 && known_eq (crtl->outgoing_args_size, 0))
4159 {
4160 /* Simple, small frame with no outgoing arguments:
4161 stp reg1, reg2, [sp, -frame_size]!
4162 stp reg3, reg4, [sp, 16] */
4163 cfun->machine->frame.callee_adjust = const_size;
4164 }
4165 else if (known_lt (crtl->outgoing_args_size
4166 + cfun->machine->frame.saved_regs_size, 512)
4167 && !(cfun->calls_alloca
4168 && known_lt (cfun->machine->frame.hard_fp_offset,
4169 max_push_offset)))
4170 {
4171 /* Frame with small outgoing arguments:
4172 sub sp, sp, frame_size
4173 stp reg1, reg2, [sp, outgoing_args_size]
4174 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4175 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4176 cfun->machine->frame.callee_offset
4177 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4178 }
4179 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4180 && const_fp_offset < max_push_offset)
4181 {
4182 /* Frame with large outgoing arguments but a small local area:
4183 stp reg1, reg2, [sp, -hard_fp_offset]!
4184 stp reg3, reg4, [sp, 16]
4185 sub sp, sp, outgoing_args_size */
4186 cfun->machine->frame.callee_adjust = const_fp_offset;
4187 cfun->machine->frame.final_adjust
4188 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4189 }
4190 else
4191 {
4192 /* Frame with large local area and outgoing arguments using frame pointer:
4193 sub sp, sp, hard_fp_offset
4194 stp x29, x30, [sp, 0]
4195 add x29, sp, 0
4196 stp reg3, reg4, [sp, 16]
4197 sub sp, sp, outgoing_args_size */
4198 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4199 cfun->machine->frame.final_adjust
4200 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4201 }
4202
4203 cfun->machine->frame.laid_out = true;
4204 }
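
/* Illustrative sketch (not part of the build): the choice between the
   four frame shapes made above, with all sizes assumed to be known
   compile-time constants in bytes.  frame_shape_example and its
   parameters are hypothetical names; max_push is 512 when two
   write-back candidates exist, 256 with one and 0 with none, matching
   max_push_offset above.  */
#if 0
static int
frame_shape_example (long frame_size, long outgoing_args, long saved_regs,
		     long hard_fp_offset, long max_push, int calls_alloca)
{
  if (frame_size < max_push && outgoing_args == 0)
    return 1;	/* stp reg1, reg2, [sp, -frame_size]!  */
  if (outgoing_args + saved_regs < 512
      && !(calls_alloca && hard_fp_offset < max_push))
    return 2;	/* sub sp, sp, frame_size; saves at [sp, outgoing_args]  */
  if (hard_fp_offset < max_push)
    return 3;	/* stp reg1, reg2, [sp, -hard_fp_offset]!; then sub sp  */
  return 4;	/* sub sp, sp, hard_fp_offset; saves; then sub sp again  */
}
#endif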
4205
4206 /* Return true if the register REGNO is saved on entry to
4207 the current function. */
4208
4209 static bool
4210 aarch64_register_saved_on_entry (int regno)
4211 {
4212 return cfun->machine->frame.reg_offset[regno] >= 0;
4213 }
4214
4215 /* Return the next register from REGNO up to LIMIT that the callee
4216    needs to save.  */
4217
4218 static unsigned
4219 aarch64_next_callee_save (unsigned regno, unsigned limit)
4220 {
4221 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4222 regno ++;
4223 return regno;
4224 }
4225
4226 /* Push the register number REGNO of mode MODE to the stack with write-back
4227 adjusting the stack by ADJUSTMENT. */
4228
4229 static void
4230 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4231 HOST_WIDE_INT adjustment)
4232 {
4233 rtx base_rtx = stack_pointer_rtx;
4234 rtx insn, reg, mem;
4235
4236 reg = gen_rtx_REG (mode, regno);
4237 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4238 plus_constant (Pmode, base_rtx, -adjustment));
4239 mem = gen_frame_mem (mode, mem);
4240
4241 insn = emit_move_insn (mem, reg);
4242 RTX_FRAME_RELATED_P (insn) = 1;
4243 }
4244
4245 /* Generate and return an instruction to store the pair of registers
4246 REG and REG2 of mode MODE to location BASE with write-back adjusting
4247 the stack location BASE by ADJUSTMENT. */
4248
4249 static rtx
4250 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4251 HOST_WIDE_INT adjustment)
4252 {
4253 switch (mode)
4254 {
4255 case E_DImode:
4256 return gen_storewb_pairdi_di (base, base, reg, reg2,
4257 GEN_INT (-adjustment),
4258 GEN_INT (UNITS_PER_WORD - adjustment));
4259 case E_DFmode:
4260 return gen_storewb_pairdf_di (base, base, reg, reg2,
4261 GEN_INT (-adjustment),
4262 GEN_INT (UNITS_PER_WORD - adjustment));
4263 default:
4264 gcc_unreachable ();
4265 }
4266 }
4267
4268 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4269 stack pointer by ADJUSTMENT. */
4270
4271 static void
4272 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4273 {
4274 rtx_insn *insn;
4275 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4276
4277 if (regno2 == INVALID_REGNUM)
4278 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4279
4280 rtx reg1 = gen_rtx_REG (mode, regno1);
4281 rtx reg2 = gen_rtx_REG (mode, regno2);
4282
4283 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4284 reg2, adjustment));
4285 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4286 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4287 RTX_FRAME_RELATED_P (insn) = 1;
4288 }
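
/* For example, in a typical prologue aarch64_push_regs (R29_REGNUM,
   R30_REGNUM, 32) emits the single write-back store

	stp	x29, x30, [sp, -32]!

   and marks both stores in the parallel as frame-related, so the
   unwinder sees the stack adjustment and both save slots.  */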
4289
4290 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4291 adjusting it by ADJUSTMENT afterwards. */
4292
4293 static rtx
4294 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4295 HOST_WIDE_INT adjustment)
4296 {
4297 switch (mode)
4298 {
4299 case E_DImode:
4300 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4301 GEN_INT (UNITS_PER_WORD));
4302 case E_DFmode:
4303 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4304 GEN_INT (UNITS_PER_WORD));
4305 default:
4306 gcc_unreachable ();
4307 }
4308 }
4309
4310 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4311 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4312 into CFI_OPS. */
4313
4314 static void
4315 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4316 rtx *cfi_ops)
4317 {
4318 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4319 rtx reg1 = gen_rtx_REG (mode, regno1);
4320
4321 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4322
4323 if (regno2 == INVALID_REGNUM)
4324 {
4325 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4326 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4327 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4328 }
4329 else
4330 {
4331 rtx reg2 = gen_rtx_REG (mode, regno2);
4332 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4333 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4334 reg2, adjustment));
4335 }
4336 }
4337
4338 /* Generate and return a store pair instruction of mode MODE to store
4339 register REG1 to MEM1 and register REG2 to MEM2. */
4340
4341 static rtx
4342 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4343 rtx reg2)
4344 {
4345 switch (mode)
4346 {
4347 case E_DImode:
4348 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4349
4350 case E_DFmode:
4351 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4352
4353 default:
4354 gcc_unreachable ();
4355 }
4356 }
4357
4358 /* Generate and return a load pair instruction of mode MODE to load register
4359 REG1 from MEM1 and register REG2 from MEM2. */
4360
4361 static rtx
4362 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4363 rtx mem2)
4364 {
4365 switch (mode)
4366 {
4367 case E_DImode:
4368 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4369
4370 case E_DFmode:
4371 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4372
4373 default:
4374 gcc_unreachable ();
4375 }
4376 }
4377
4378 /* Return TRUE if return address signing should be enabled for the current
4379 function, otherwise return FALSE. */
4380
4381 bool
4382 aarch64_return_address_signing_enabled (void)
4383 {
4384   /* This function should only be called after the frame is laid out.  */
4385 gcc_assert (cfun->machine->frame.laid_out);
4386
4387 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4388      if its LR is pushed onto the stack.  */
4389 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4390 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4391 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4392 }
4393
4394 /* Emit code to save the callee-saved registers from register number START
4395 to LIMIT to the stack at the location starting at offset START_OFFSET,
4396 skipping any write-back candidates if SKIP_WB is true. */
4397
4398 static void
4399 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4400 unsigned start, unsigned limit, bool skip_wb)
4401 {
4402 rtx_insn *insn;
4403 unsigned regno;
4404 unsigned regno2;
4405
4406 for (regno = aarch64_next_callee_save (start, limit);
4407 regno <= limit;
4408 regno = aarch64_next_callee_save (regno + 1, limit))
4409 {
4410 rtx reg, mem;
4411 poly_int64 offset;
4412
4413 if (skip_wb
4414 && (regno == cfun->machine->frame.wb_candidate1
4415 || regno == cfun->machine->frame.wb_candidate2))
4416 continue;
4417
4418 if (cfun->machine->reg_is_wrapped_separately[regno])
4419 continue;
4420
4421 reg = gen_rtx_REG (mode, regno);
4422 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4423 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4424 offset));
4425
4426 regno2 = aarch64_next_callee_save (regno + 1, limit);
4427
4428 if (regno2 <= limit
4429 && !cfun->machine->reg_is_wrapped_separately[regno2]
4430 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4431 == cfun->machine->frame.reg_offset[regno2]))
4432
4433 {
4434 rtx reg2 = gen_rtx_REG (mode, regno2);
4435 rtx mem2;
4436
4437 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4438 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4439 offset));
4440 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4441 reg2));
4442
4443 /* The first part of a frame-related parallel insn is
4444 always assumed to be relevant to the frame
4445	     calculations; subsequent parts are only
4446 frame-related if explicitly marked. */
4447 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4448 regno = regno2;
4449 }
4450 else
4451 insn = emit_move_insn (mem, reg);
4452
4453 RTX_FRAME_RELATED_P (insn) = 1;
4454 }
4455 }
4456
4457 /* Emit code to restore the callee-saved registers of mode MODE from register
4458 number START up to and including LIMIT. Restore from the stack offset
4459 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4460 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4461
4462 static void
4463 aarch64_restore_callee_saves (machine_mode mode,
4464 poly_int64 start_offset, unsigned start,
4465 unsigned limit, bool skip_wb, rtx *cfi_ops)
4466 {
4467 rtx base_rtx = stack_pointer_rtx;
4468 unsigned regno;
4469 unsigned regno2;
4470 poly_int64 offset;
4471
4472 for (regno = aarch64_next_callee_save (start, limit);
4473 regno <= limit;
4474 regno = aarch64_next_callee_save (regno + 1, limit))
4475 {
4476 if (cfun->machine->reg_is_wrapped_separately[regno])
4477 continue;
4478
4479 rtx reg, mem;
4480
4481 if (skip_wb
4482 && (regno == cfun->machine->frame.wb_candidate1
4483 || regno == cfun->machine->frame.wb_candidate2))
4484 continue;
4485
4486 reg = gen_rtx_REG (mode, regno);
4487 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4488 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4489
4490 regno2 = aarch64_next_callee_save (regno + 1, limit);
4491
4492 if (regno2 <= limit
4493 && !cfun->machine->reg_is_wrapped_separately[regno2]
4494 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4495 == cfun->machine->frame.reg_offset[regno2]))
4496 {
4497 rtx reg2 = gen_rtx_REG (mode, regno2);
4498 rtx mem2;
4499
4500 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4501 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4502 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4503
4504 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4505 regno = regno2;
4506 }
4507 else
4508 emit_move_insn (reg, mem);
4509 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4510 }
4511 }
4512
4513 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4514 of MODE. */
4515
4516 static inline bool
4517 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4518 {
4519 HOST_WIDE_INT multiple;
4520 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4521 && IN_RANGE (multiple, -8, 7));
4522 }
4523
4524 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4525 of MODE. */
4526
4527 static inline bool
4528 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4529 {
4530 HOST_WIDE_INT multiple;
4531 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4532 && IN_RANGE (multiple, 0, 63));
4533 }
4534
4535 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4536 of MODE. */
4537
4538 bool
4539 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4540 {
4541 HOST_WIDE_INT multiple;
4542 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4543 && IN_RANGE (multiple, -64, 63));
4544 }
4545
4546 /* Return true if OFFSET is a signed 9-bit value. */
4547
4548 static inline bool
4549 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4550 poly_int64 offset)
4551 {
4552 HOST_WIDE_INT const_offset;
4553 return (offset.is_constant (&const_offset)
4554 && IN_RANGE (const_offset, -256, 255));
4555 }
4556
4557 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4558 of MODE. */
4559
4560 static inline bool
4561 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4562 {
4563 HOST_WIDE_INT multiple;
4564 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4565 && IN_RANGE (multiple, -256, 255));
4566 }
4567
4568 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4569 of MODE. */
4570
4571 static inline bool
4572 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4573 {
4574 HOST_WIDE_INT multiple;
4575 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4576 && IN_RANGE (multiple, 0, 4095));
4577 }
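
/* For the common case of 8-byte (DImode) accesses, the predicates above
   accept the following byte offsets (derived directly from the IN_RANGE
   bounds):

     offset_4bit_signed_scaled_p		  -64 ... 56, multiples of 8
     offset_6bit_unsigned_scaled_p		    0 ... 504, multiples of 8
     aarch64_offset_7bit_signed_scaled_p	 -512 ... 504, multiples of 8
     offset_9bit_signed_unscaled_p		 -256 ... 255, any byte offset
     offset_9bit_signed_scaled_p		-2048 ... 2040, multiples of 8
     offset_12bit_unsigned_scaled_p		    0 ... 32760, multiples of 8  */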
4578
4579 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4580
4581 static sbitmap
4582 aarch64_get_separate_components (void)
4583 {
4584 aarch64_layout_frame ();
4585
4586 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4587 bitmap_clear (components);
4588
4589 /* The registers we need saved to the frame. */
4590 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4591 if (aarch64_register_saved_on_entry (regno))
4592 {
4593 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4594 if (!frame_pointer_needed)
4595 offset += cfun->machine->frame.frame_size
4596 - cfun->machine->frame.hard_fp_offset;
4597 /* Check that we can access the stack slot of the register with one
4598 direct load with no adjustments needed. */
4599 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4600 bitmap_set_bit (components, regno);
4601 }
4602
4603 /* Don't mess with the hard frame pointer. */
4604 if (frame_pointer_needed)
4605 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4606
4607 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4608 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4609 /* If aarch64_layout_frame has chosen registers to store/restore with
4610 writeback don't interfere with them to avoid having to output explicit
4611 stack adjustment instructions. */
4612 if (reg2 != INVALID_REGNUM)
4613 bitmap_clear_bit (components, reg2);
4614 if (reg1 != INVALID_REGNUM)
4615 bitmap_clear_bit (components, reg1);
4616
4617 bitmap_clear_bit (components, LR_REGNUM);
4618 bitmap_clear_bit (components, SP_REGNUM);
4619
4620 return components;
4621 }
4622
4623 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4624
4625 static sbitmap
4626 aarch64_components_for_bb (basic_block bb)
4627 {
4628 bitmap in = DF_LIVE_IN (bb);
4629 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4630 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4631
4632 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4633 bitmap_clear (components);
4634
4635 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4636 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4637 if ((!call_used_regs[regno])
4638 && (bitmap_bit_p (in, regno)
4639 || bitmap_bit_p (gen, regno)
4640 || bitmap_bit_p (kill, regno)))
4641 {
4642 unsigned regno2, offset, offset2;
4643 bitmap_set_bit (components, regno);
4644
4645 /* If there is a callee-save at an adjacent offset, add it too
4646 to increase the use of LDP/STP. */
4647 offset = cfun->machine->frame.reg_offset[regno];
4648 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4649
4650 if (regno2 <= LAST_SAVED_REGNUM)
4651 {
4652 offset2 = cfun->machine->frame.reg_offset[regno2];
4653 if ((offset & ~8) == (offset2 & ~8))
4654 bitmap_set_bit (components, regno2);
4655 }
4656 }
4657
4658 return components;
4659 }
4660
4661 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4662 Nothing to do for aarch64. */
4663
4664 static void
4665 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4666 {
4667 }
4668
4669 /* Return the next set bit in BMP from START onwards. Return the total number
4670 of bits in BMP if no set bit is found at or after START. */
4671
4672 static unsigned int
4673 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4674 {
4675 unsigned int nbits = SBITMAP_SIZE (bmp);
4676 if (start == nbits)
4677 return start;
4678
4679 gcc_assert (start < nbits);
4680 for (unsigned int i = start; i < nbits; i++)
4681 if (bitmap_bit_p (bmp, i))
4682 return i;
4683
4684 return nbits;
4685 }
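
/* Illustrative sketch (not part of the build): the iteration idiom that
   aarch64_process_components below builds on, shown with a plain bool
   array standing in for an sbitmap; next_set_bit_example and
   walk_components_example are hypothetical stand-ins.  */
#if 0
static unsigned int
next_set_bit_example (const bool *bmp, unsigned int nbits, unsigned int start)
{
  for (unsigned int i = start; i < nbits; i++)
    if (bmp[i])
      return i;
  return nbits;
}

static void
walk_components_example (const bool *bmp, unsigned int nbits)
{
  unsigned int regno = next_set_bit_example (bmp, nbits, 0);
  while (regno != nbits)
    {
      /* ... save or restore REGNO here ...  */
      regno = next_set_bit_example (bmp, nbits, regno + 1);
    }
}
#endif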
4686
4687 /* Do the work for aarch64_emit_prologue_components and
4688 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4689 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4690 for these components or the epilogue sequence. That is, it determines
4691 whether we should emit stores or loads and what kind of CFA notes to attach
4692 to the insns. Otherwise the logic for the two sequences is very
4693 similar. */
4694
4695 static void
4696 aarch64_process_components (sbitmap components, bool prologue_p)
4697 {
4698 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4699 ? HARD_FRAME_POINTER_REGNUM
4700 : STACK_POINTER_REGNUM);
4701
4702 unsigned last_regno = SBITMAP_SIZE (components);
4703 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4704 rtx_insn *insn = NULL;
4705
4706 while (regno != last_regno)
4707 {
4708 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4709 so DFmode for the vector registers is enough. */
4710 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4711 rtx reg = gen_rtx_REG (mode, regno);
4712 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4713 if (!frame_pointer_needed)
4714 offset += cfun->machine->frame.frame_size
4715 - cfun->machine->frame.hard_fp_offset;
4716 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4717 rtx mem = gen_frame_mem (mode, addr);
4718
4719 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4720 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4721 /* No more registers to handle after REGNO.
4722 Emit a single save/restore and exit. */
4723 if (regno2 == last_regno)
4724 {
4725 insn = emit_insn (set);
4726 RTX_FRAME_RELATED_P (insn) = 1;
4727 if (prologue_p)
4728 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4729 else
4730 add_reg_note (insn, REG_CFA_RESTORE, reg);
4731 break;
4732 }
4733
4734 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4735 /* The next register is not of the same class or its offset is not
4736 mergeable with the current one into a pair. */
4737 if (!satisfies_constraint_Ump (mem)
4738 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4739 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4740 GET_MODE_SIZE (mode)))
4741 {
4742 insn = emit_insn (set);
4743 RTX_FRAME_RELATED_P (insn) = 1;
4744 if (prologue_p)
4745 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4746 else
4747 add_reg_note (insn, REG_CFA_RESTORE, reg);
4748
4749 regno = regno2;
4750 continue;
4751 }
4752
4753 /* REGNO2 can be saved/restored in a pair with REGNO. */
4754 rtx reg2 = gen_rtx_REG (mode, regno2);
4755 if (!frame_pointer_needed)
4756 offset2 += cfun->machine->frame.frame_size
4757 - cfun->machine->frame.hard_fp_offset;
4758 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4759 rtx mem2 = gen_frame_mem (mode, addr2);
4760 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4761 : gen_rtx_SET (reg2, mem2);
4762
4763 if (prologue_p)
4764 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4765 else
4766 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4767
4768 RTX_FRAME_RELATED_P (insn) = 1;
4769 if (prologue_p)
4770 {
4771 add_reg_note (insn, REG_CFA_OFFSET, set);
4772 add_reg_note (insn, REG_CFA_OFFSET, set2);
4773 }
4774 else
4775 {
4776 add_reg_note (insn, REG_CFA_RESTORE, reg);
4777 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4778 }
4779
4780 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4781 }
4782 }
4783
4784 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4785
4786 static void
4787 aarch64_emit_prologue_components (sbitmap components)
4788 {
4789 aarch64_process_components (components, true);
4790 }
4791
4792 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4793
4794 static void
4795 aarch64_emit_epilogue_components (sbitmap components)
4796 {
4797 aarch64_process_components (components, false);
4798 }
4799
4800 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4801
4802 static void
4803 aarch64_set_handled_components (sbitmap components)
4804 {
4805 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4806 if (bitmap_bit_p (components, regno))
4807 cfun->machine->reg_is_wrapped_separately[regno] = true;
4808 }
4809
4810 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4811 is saved at BASE + OFFSET. */
4812
4813 static void
4814 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4815 rtx base, poly_int64 offset)
4816 {
4817 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4818 add_reg_note (insn, REG_CFA_EXPRESSION,
4819 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4820 }
4821
4822 /* AArch64 stack frames generated by this compiler look like:
4823
4824 +-------------------------------+
4825 | |
4826 | incoming stack arguments |
4827 | |
4828 +-------------------------------+
4829 | | <-- incoming stack pointer (aligned)
4830 | callee-allocated save area |
4831 | for register varargs |
4832 | |
4833 +-------------------------------+
4834 | local variables | <-- frame_pointer_rtx
4835 | |
4836 +-------------------------------+
4837 | padding0 | \
4838 +-------------------------------+ |
4839 | callee-saved registers | | frame.saved_regs_size
4840 +-------------------------------+ |
4841 | LR' | |
4842 +-------------------------------+ |
4843 | FP' | / <- hard_frame_pointer_rtx (aligned)
4844 +-------------------------------+
4845 | dynamic allocation |
4846 +-------------------------------+
4847 | padding |
4848 +-------------------------------+
4849 | outgoing stack arguments | <-- arg_pointer
4850 | |
4851 +-------------------------------+
4852 | | <-- stack_pointer_rtx (aligned)
4853
4854 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4855 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4856 unchanged. */
4857
4858 /* Generate the prologue instructions for entry into a function.
4859 Establish the stack frame by decreasing the stack pointer with a
4860 properly calculated size and, if necessary, create a frame record
4861 filled with the values of LR and previous frame pointer. The
4862 current FP is also set up if it is in use. */
4863
4864 void
4865 aarch64_expand_prologue (void)
4866 {
4867 aarch64_layout_frame ();
4868
4869 poly_int64 frame_size = cfun->machine->frame.frame_size;
4870 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4871 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4872 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4873 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4874 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4875 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4876 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4877 rtx_insn *insn;
4878
4879 /* Sign return address for functions. */
4880 if (aarch64_return_address_signing_enabled ())
4881 {
4882 insn = emit_insn (gen_pacisp ());
4883 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4884 RTX_FRAME_RELATED_P (insn) = 1;
4885 }
4886
4887 if (flag_stack_usage_info)
4888 current_function_static_stack_size = constant_lower_bound (frame_size);
4889
4890 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4891 {
4892 if (crtl->is_leaf && !cfun->calls_alloca)
4893 {
4894 if (maybe_gt (frame_size, PROBE_INTERVAL)
4895 && maybe_gt (frame_size, get_stack_check_protect ()))
4896 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4897 (frame_size
4898 - get_stack_check_protect ()));
4899 }
4900 else if (maybe_gt (frame_size, 0))
4901 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4902 }
4903
4904 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4905 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4906
4907 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4908
4909 if (callee_adjust != 0)
4910 aarch64_push_regs (reg1, reg2, callee_adjust);
4911
4912 if (emit_frame_chain)
4913 {
4914 poly_int64 reg_offset = callee_adjust;
4915 if (callee_adjust == 0)
4916 {
4917 reg1 = R29_REGNUM;
4918 reg2 = R30_REGNUM;
4919 reg_offset = callee_offset;
4920 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4921 }
4922 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4923 stack_pointer_rtx, callee_offset,
4924 ip1_rtx, ip0_rtx, frame_pointer_needed);
4925 if (frame_pointer_needed && !frame_size.is_constant ())
4926 {
4927 /* Variable-sized frames need to describe the save slot
4928 address using DW_CFA_expression rather than DW_CFA_offset.
4929 This means that, without taking further action, the
4930 locations of the registers that we've already saved would
4931 remain based on the stack pointer even after we redefine
4932 the CFA based on the frame pointer. We therefore need new
4933 DW_CFA_expressions to re-express the save slots with addresses
4934 based on the frame pointer. */
4935 rtx_insn *insn = get_last_insn ();
4936 gcc_assert (RTX_FRAME_RELATED_P (insn));
4937
4938 /* Add an explicit CFA definition if this was previously
4939 implicit. */
4940 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4941 {
4942 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4943 callee_offset);
4944 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4945 gen_rtx_SET (hard_frame_pointer_rtx, src));
4946 }
4947
4948 /* Change the save slot expressions for the registers that
4949 we've already saved. */
4950 reg_offset -= callee_offset;
4951 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4952 reg_offset + UNITS_PER_WORD);
4953 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4954 reg_offset);
4955 }
4956 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4957 }
4958
4959 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4960 callee_adjust != 0 || emit_frame_chain);
4961 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4962 callee_adjust != 0 || emit_frame_chain);
4963 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4964 }
4965
4966 /* Return TRUE if we can use a simple_return insn.
4967
4968    This function checks whether the callee-saved stack is empty, which
4969    means no restore actions are needed.  The pro_and_epilogue pass will use
4970    this to check whether the shrink-wrapping optimization is feasible.  */
4971
4972 bool
4973 aarch64_use_return_insn_p (void)
4974 {
4975 if (!reload_completed)
4976 return false;
4977
4978 if (crtl->profile)
4979 return false;
4980
4981 aarch64_layout_frame ();
4982
4983 return known_eq (cfun->machine->frame.frame_size, 0);
4984 }
4985
4986 /* Generate the epilogue instructions for returning from a function.
4987    This is almost exactly the reverse of the prologue sequence, except
4988 that we need to insert barriers to avoid scheduling loads that read
4989 from a deallocated stack, and we optimize the unwind records by
4990 emitting them all together if possible. */
4991 void
4992 aarch64_expand_epilogue (bool for_sibcall)
4993 {
4994 aarch64_layout_frame ();
4995
4996 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4997 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4998 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4999 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5000 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5001 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5002 rtx cfi_ops = NULL;
5003 rtx_insn *insn;
5004 /* A stack clash protection prologue may not have left IP0_REGNUM or
5005 IP1_REGNUM in a usable state. The same is true for allocations
5006 with an SVE component, since we then need both temporary registers
5007 for each allocation. */
5008 bool can_inherit_p = (initial_adjust.is_constant ()
5009 && final_adjust.is_constant ()
5010 && !flag_stack_clash_protection);
5011
5012   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
5013 bool need_barrier_p
5014 = maybe_ne (get_frame_size ()
5015 + cfun->machine->frame.saved_varargs_size, 0);
5016
5017 /* Emit a barrier to prevent loads from a deallocated stack. */
5018 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5019 || cfun->calls_alloca
5020 || crtl->calls_eh_return)
5021 {
5022 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5023 need_barrier_p = false;
5024 }
5025
5026 /* Restore the stack pointer from the frame pointer if it may not
5027 be the same as the stack pointer. */
5028 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5029 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5030 if (frame_pointer_needed
5031 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5032 /* If writeback is used when restoring callee-saves, the CFA
5033 is restored on the instruction doing the writeback. */
5034 aarch64_add_offset (Pmode, stack_pointer_rtx,
5035 hard_frame_pointer_rtx, -callee_offset,
5036 ip1_rtx, ip0_rtx, callee_adjust == 0);
5037 else
5038 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5039 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5040
5041 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5042 callee_adjust != 0, &cfi_ops);
5043 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5044 callee_adjust != 0, &cfi_ops);
5045
5046 if (need_barrier_p)
5047 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5048
5049 if (callee_adjust != 0)
5050 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5051
5052 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5053 {
5054 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5055 insn = get_last_insn ();
5056 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5057 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5058 RTX_FRAME_RELATED_P (insn) = 1;
5059 cfi_ops = NULL;
5060 }
5061
5062 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5063 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5064
5065 if (cfi_ops)
5066 {
5067 /* Emit delayed restores and reset the CFA to be SP. */
5068 insn = get_last_insn ();
5069 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5070 REG_NOTES (insn) = cfi_ops;
5071 RTX_FRAME_RELATED_P (insn) = 1;
5072 }
5073
5074   /* We prefer to emit the combined return/authenticate instruction RETAA;
5075      however, there are three cases in which we must instead emit an explicit
5076 authentication instruction.
5077
5078 1) Sibcalls don't return in a normal way, so if we're about to call one
5079 we must authenticate.
5080
5081 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5082 generating code for !TARGET_ARMV8_3 we can't use it and must
5083 explicitly authenticate.
5084
5085 3) On an eh_return path we make extra stack adjustments to update the
5086 canonical frame address to be the exception handler's CFA. We want
5087 to authenticate using the CFA of the function which calls eh_return.
5088 */
5089 if (aarch64_return_address_signing_enabled ()
5090 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5091 {
5092 insn = emit_insn (gen_autisp ());
5093 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5094 RTX_FRAME_RELATED_P (insn) = 1;
5095 }
5096
5097 /* Stack adjustment for exception handler. */
5098 if (crtl->calls_eh_return)
5099 {
5100 /* We need to unwind the stack by the offset computed by
5101 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5102 to be SP; letting the CFA move during this adjustment
5103 is just as correct as retaining the CFA from the body
5104 of the function. Therefore, do nothing special. */
5105 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5106 }
5107
5108 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5109 if (!for_sibcall)
5110 emit_jump_insn (ret_rtx);
5111 }
5112
5113 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5114 normally or return to a previous frame after unwinding.
5115
5116 An EH return uses a single shared return sequence. The epilogue is
5117 exactly like a normal epilogue except that it has an extra input
5118 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5119 that must be applied after the frame has been destroyed. An extra label
5120 is inserted before the epilogue which initializes this register to zero,
5121 and this is the entry point for a normal return.
5122
5123 An actual EH return updates the return address, initializes the stack
5124 adjustment and jumps directly into the epilogue (bypassing the zeroing
5125 of the adjustment). Since the return address is typically saved on the
5126 stack when a function makes a call, the saved LR must be updated outside
5127 the epilogue.
5128
5129 This poses problems as the store is generated well before the epilogue,
5130 so the offset of LR is not known yet. Also optimizations will remove the
5131 store as it appears dead, even after the epilogue is generated (as the
5132 base or offset for loading LR is different in many cases).
5133
5134 To avoid these problems this implementation forces the frame pointer
5135 in eh_return functions so that the location of LR is fixed and known early.
5136 It also marks the store volatile, so no optimization is permitted to
5137 remove the store. */
5138 rtx
5139 aarch64_eh_return_handler_rtx (void)
5140 {
5141 rtx tmp = gen_frame_mem (Pmode,
5142 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5143
5144 /* Mark the store volatile, so no optimization is permitted to remove it. */
5145 MEM_VOLATILE_P (tmp) = true;
5146 return tmp;
5147 }
5148
5149 /* Output code to add DELTA to the first argument, and then jump
5150 to FUNCTION. Used for C++ multiple inheritance. */
5151 static void
5152 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5153 HOST_WIDE_INT delta,
5154 HOST_WIDE_INT vcall_offset,
5155 tree function)
5156 {
5157 /* The this pointer is always in x0. Note that this differs from
5158      Arm where the this pointer may be bumped to r1 if r0 is required
5159 to return a pointer to an aggregate. On AArch64 a result value
5160 pointer will be in x8. */
5161 int this_regno = R0_REGNUM;
5162 rtx this_rtx, temp0, temp1, addr, funexp;
5163 rtx_insn *insn;
5164
5165 reload_completed = 1;
5166 emit_note (NOTE_INSN_PROLOGUE_END);
5167
5168 this_rtx = gen_rtx_REG (Pmode, this_regno);
5169 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5170 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5171
5172 if (vcall_offset == 0)
5173 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5174 else
5175 {
5176 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5177
5178 addr = this_rtx;
5179 if (delta != 0)
5180 {
5181 if (delta >= -256 && delta < 256)
5182 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5183 plus_constant (Pmode, this_rtx, delta));
5184 else
5185 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5186 temp1, temp0, false);
5187 }
5188
5189 if (Pmode == ptr_mode)
5190 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5191 else
5192 aarch64_emit_move (temp0,
5193 gen_rtx_ZERO_EXTEND (Pmode,
5194 gen_rtx_MEM (ptr_mode, addr)));
5195
5196 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5197 addr = plus_constant (Pmode, temp0, vcall_offset);
5198 else
5199 {
5200 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5201 Pmode);
5202 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5203 }
5204
5205 if (Pmode == ptr_mode)
5206 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5207 else
5208 aarch64_emit_move (temp1,
5209 gen_rtx_SIGN_EXTEND (Pmode,
5210 gen_rtx_MEM (ptr_mode, addr)));
5211
5212 emit_insn (gen_add2_insn (this_rtx, temp1));
5213 }
5214
5215 /* Generate a tail call to the target function. */
5216 if (!TREE_USED (function))
5217 {
5218 assemble_external (function);
5219 TREE_USED (function) = 1;
5220 }
5221 funexp = XEXP (DECL_RTL (function), 0);
5222 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5223 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5224 SIBLING_CALL_P (insn) = 1;
5225
5226 insn = get_insns ();
5227 shorten_branches (insn);
5228 final_start_function (insn, file, 1);
5229 final (insn, file, 1);
5230 final_end_function ();
5231
5232 /* Stop pretending to be a post-reload pass. */
5233 reload_completed = 0;
5234 }
5235
5236 static bool
5237 aarch64_tls_referenced_p (rtx x)
5238 {
5239 if (!TARGET_HAVE_TLS)
5240 return false;
5241 subrtx_iterator::array_type array;
5242 FOR_EACH_SUBRTX (iter, array, x, ALL)
5243 {
5244 const_rtx x = *iter;
5245 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5246 return true;
5247 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5248 TLS offsets, not real symbol references. */
5249 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5250 iter.skip_subrtxes ();
5251 }
5252 return false;
5253 }
5254
5255
5256 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5257 a left shift of 0 or 12 bits. */
5258 bool
5259 aarch64_uimm12_shift (HOST_WIDE_INT val)
5260 {
5261 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5262 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5263 );
5264 }
5265
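/* Worked examples:
     aarch64_uimm12_shift (0xabc)    -> true  (fits in the low 12 bits)
     aarch64_uimm12_shift (0xabc000) -> true  (0xabc shifted left by 12)
     aarch64_uimm12_shift (0xabc001) -> false (needs bits in both halves)
   matching the "#imm12, LSL #0/#12" forms accepted by ADD/SUB (immediate).  */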
5266
5267 /* Return true if val is an immediate that can be loaded into a
5268 register by a MOVZ instruction. */
5269 static bool
5270 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5271 {
5272 if (GET_MODE_SIZE (mode) > 4)
5273 {
5274 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5275 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5276 return true;
5277 }
5278 else
5279 {
5280 /* Ignore sign extension. */
5281 val &= (HOST_WIDE_INT) 0xffffffff;
5282 }
5283 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5284 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5285 }
5286
5287 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5288 64-bit (DImode) integer. */
5289
5290 static unsigned HOST_WIDE_INT
5291 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5292 {
5293 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5294 while (size < 64)
5295 {
5296 val &= (HOST_WIDE_INT_1U << size) - 1;
5297 val |= val << size;
5298 size *= 2;
5299 }
5300 return val;
5301 }
5302
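/* For example, replicating the QImode value 0xa5 gives
   0xa5a5a5a5a5a5a5a5 and the HImode value 0x1234 gives
   0x1234123412341234.  */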
5303 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5304
5305 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5306 {
5307 0x0000000100000001ull,
5308 0x0001000100010001ull,
5309 0x0101010101010101ull,
5310 0x1111111111111111ull,
5311 0x5555555555555555ull,
5312 };
5313
5314
5315 /* Return true if val is a valid bitmask immediate. */
5316
5317 bool
5318 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5319 {
5320 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5321 int bits;
5322
5323 /* Check for a single sequence of one bits and return quickly if so.
5324 The special cases of all ones and all zeroes return false. */
5325 val = aarch64_replicate_bitmask_imm (val_in, mode);
5326 tmp = val + (val & -val);
5327
5328 if (tmp == (tmp & -tmp))
5329 return (val + 1) > 1;
5330
5331 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5332 if (mode == SImode)
5333 val = (val << 32) | (val & 0xffffffff);
5334
5335 /* Invert if the immediate doesn't start with a zero bit - this means we
5336 only need to search for sequences of one bits. */
5337 if (val & 1)
5338 val = ~val;
5339
5340 /* Find the first set bit and set tmp to val with the first sequence of one
5341 bits removed. Return success if there is a single sequence of ones. */
5342 first_one = val & -val;
5343 tmp = val & (val + first_one);
5344
5345 if (tmp == 0)
5346 return true;
5347
5348 /* Find the next set bit and compute the difference in bit position. */
5349 next_one = tmp & -tmp;
5350 bits = clz_hwi (first_one) - clz_hwi (next_one);
5351 mask = val ^ tmp;
5352
5353 /* Check the bit position difference is a power of 2, and that the first
5354 sequence of one bits fits within 'bits' bits. */
5355 if ((mask >> bits) != 0 || bits != (bits & -bits))
5356 return false;
5357
5358 /* Check the sequence of one bits is repeated 64/bits times. */
5359 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5360 }
5361
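/* Worked examples (DImode):
     0x00000000000003fc -> true, a single contiguous run of ones
                           (caught by the quick check above);
     0x0f0f0f0f0f0f0f0f -> true, the 8-bit element 0x0f repeated to
                           fill 64 bits;
     0x0000000000000005 -> false, the pattern 101 does not repeat to
                           fill 64 bits.  */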
5362 /* Create a mask of ones covering the range from the lowest to the highest
5363 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5364
5365 unsigned HOST_WIDE_INT
5366 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5367 {
5368 int lowest_bit_set = ctz_hwi (val_in);
5369 int highest_bit_set = floor_log2 (val_in);
5370 gcc_assert (val_in != 0);
5371
5372 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5373 (HOST_WIDE_INT_1U << lowest_bit_set));
5374 }
5375
5376 /* Create a constant in which all bits outside the range from the lowest to
5377 the highest bit set in VAL_IN are set to 1. */
5378
5379 unsigned HOST_WIDE_INT
5380 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5381 {
5382 return val_in | ~aarch64_and_split_imm1 (val_in);
5383 }
5384
5385 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5386
5387 bool
5388 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5389 {
5390 scalar_int_mode int_mode;
5391 if (!is_a <scalar_int_mode> (mode, &int_mode))
5392 return false;
5393
5394 if (aarch64_bitmask_imm (val_in, int_mode))
5395 return false;
5396
5397 if (aarch64_move_imm (val_in, int_mode))
5398 return false;
5399
5400 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5401
5402 return aarch64_bitmask_imm (imm2, int_mode);
5403 }
5404
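/* Worked example (DImode): VAL_IN = 0x00ff0f00 is neither a bitmask nor
   a MOV immediate, but
     aarch64_and_split_imm1 (0x00ff0f00) == 0x0000000000ffff00
     aarch64_and_split_imm2 (0x00ff0f00) == 0xffffffffffff0fff
   are both valid bitmask immediates and imm1 & imm2 == VAL_IN, so
   aarch64_and_bitmask_imm returns true and the AND can be done as two
   AND-immediate instructions.  */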
5405 /* Return true if val is an immediate that can be loaded into a
5406 register in a single instruction. */
5407 bool
5408 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5409 {
5410 scalar_int_mode int_mode;
5411 if (!is_a <scalar_int_mode> (mode, &int_mode))
5412 return false;
5413
5414 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5415 return true;
5416 return aarch64_bitmask_imm (val, int_mode);
5417 }
5418
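/* Examples (DImode):
     0x0000000000001234 -> true, a single MOVZ;
     0xffffffffffffaaaa -> true, a single MOVN (of 0x5555);
     0x5555555555555555 -> true, a bitmask immediate (MOV alias of ORR);
     0x0000000012345678 -> false, would need MOVZ + MOVK.  */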
5419 static bool
5420 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5421 {
5422 rtx base, offset;
5423
5424 if (GET_CODE (x) == HIGH)
5425 return true;
5426
5427 /* There's no way to calculate VL-based values using relocations. */
5428 subrtx_iterator::array_type array;
5429 FOR_EACH_SUBRTX (iter, array, x, ALL)
5430 if (GET_CODE (*iter) == CONST_POLY_INT)
5431 return true;
5432
5433 split_const (x, &base, &offset);
5434 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5435 {
5436 if (aarch64_classify_symbol (base, INTVAL (offset))
5437 != SYMBOL_FORCE_TO_MEM)
5438 return true;
5439 else
5440 /* Avoid generating a 64-bit relocation in ILP32; leave
5441 to aarch64_expand_mov_immediate to handle it properly. */
5442 return mode != ptr_mode;
5443 }
5444
5445 return aarch64_tls_referenced_p (x);
5446 }
5447
5448 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5449 The expansion of a table switch is quite expensive due to the number
5450 of instructions, the table lookup and the hard-to-predict indirect jump.
5451 When optimizing for speed at -O3 and above, use the per-core tuning if
5452 it is set; otherwise use tables for more than 16 cases as a tradeoff
5453 between size and performance. When optimizing for size, use the default setting. */
5454
5455 static unsigned int
5456 aarch64_case_values_threshold (void)
5457 {
5458 /* Use the specified limit for the number of cases before using jump
5459 tables at higher optimization levels. */
5460 if (optimize > 2
5461 && selected_cpu->tune->max_case_values != 0)
5462 return selected_cpu->tune->max_case_values;
5463 else
5464 return optimize_size ? default_case_values_threshold () : 17;
5465 }
5466
5467 /* Return true if register REGNO is a valid index register.
5468 STRICT_P is true if REG_OK_STRICT is in effect. */
5469
5470 bool
5471 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5472 {
5473 if (!HARD_REGISTER_NUM_P (regno))
5474 {
5475 if (!strict_p)
5476 return true;
5477
5478 if (!reg_renumber)
5479 return false;
5480
5481 regno = reg_renumber[regno];
5482 }
5483 return GP_REGNUM_P (regno);
5484 }
5485
5486 /* Return true if register REGNO is a valid base register.
5487 STRICT_P is true if REG_OK_STRICT is in effect. */
5488
5489 bool
5490 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5491 {
5492 if (!HARD_REGISTER_NUM_P (regno))
5493 {
5494 if (!strict_p)
5495 return true;
5496
5497 if (!reg_renumber)
5498 return false;
5499
5500 regno = reg_renumber[regno];
5501 }
5502
5503 /* The fake registers will be eliminated to either the stack or
5504 hard frame pointer, both of which are usually valid base registers.
5505 Reload deals with the cases where the eliminated form isn't valid. */
5506 return (GP_REGNUM_P (regno)
5507 || regno == SP_REGNUM
5508 || regno == FRAME_POINTER_REGNUM
5509 || regno == ARG_POINTER_REGNUM);
5510 }
5511
5512 /* Return true if X is a valid base register.
5513 STRICT_P is true if REG_OK_STRICT is in effect. */
5514
5515 static bool
5516 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5517 {
5518 if (!strict_p
5519 && GET_CODE (x) == SUBREG
5520 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5521 x = SUBREG_REG (x);
5522
5523 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5524 }
5525
5526 /* Return true if address offset is a valid index. If it is, fill in INFO
5527 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5528
5529 static bool
5530 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5531 machine_mode mode, bool strict_p)
5532 {
5533 enum aarch64_address_type type;
5534 rtx index;
5535 int shift;
5536
5537 /* (reg:P) */
5538 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5539 && GET_MODE (x) == Pmode)
5540 {
5541 type = ADDRESS_REG_REG;
5542 index = x;
5543 shift = 0;
5544 }
5545 /* (sign_extend:DI (reg:SI)) */
5546 else if ((GET_CODE (x) == SIGN_EXTEND
5547 || GET_CODE (x) == ZERO_EXTEND)
5548 && GET_MODE (x) == DImode
5549 && GET_MODE (XEXP (x, 0)) == SImode)
5550 {
5551 type = (GET_CODE (x) == SIGN_EXTEND)
5552 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5553 index = XEXP (x, 0);
5554 shift = 0;
5555 }
5556 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5557 else if (GET_CODE (x) == MULT
5558 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5559 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5560 && GET_MODE (XEXP (x, 0)) == DImode
5561 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5562 && CONST_INT_P (XEXP (x, 1)))
5563 {
5564 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5565 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5566 index = XEXP (XEXP (x, 0), 0);
5567 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5568 }
5569 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5570 else if (GET_CODE (x) == ASHIFT
5571 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5572 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5573 && GET_MODE (XEXP (x, 0)) == DImode
5574 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5575 && CONST_INT_P (XEXP (x, 1)))
5576 {
5577 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5578 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5579 index = XEXP (XEXP (x, 0), 0);
5580 shift = INTVAL (XEXP (x, 1));
5581 }
5582 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5583 else if ((GET_CODE (x) == SIGN_EXTRACT
5584 || GET_CODE (x) == ZERO_EXTRACT)
5585 && GET_MODE (x) == DImode
5586 && GET_CODE (XEXP (x, 0)) == MULT
5587 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5588 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5589 {
5590 type = (GET_CODE (x) == SIGN_EXTRACT)
5591 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5592 index = XEXP (XEXP (x, 0), 0);
5593 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5594 if (INTVAL (XEXP (x, 1)) != 32 + shift
5595 || INTVAL (XEXP (x, 2)) != 0)
5596 shift = -1;
5597 }
5598 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5599 (const_int 0xffffffff<<shift)) */
5600 else if (GET_CODE (x) == AND
5601 && GET_MODE (x) == DImode
5602 && GET_CODE (XEXP (x, 0)) == MULT
5603 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5604 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5605 && CONST_INT_P (XEXP (x, 1)))
5606 {
5607 type = ADDRESS_REG_UXTW;
5608 index = XEXP (XEXP (x, 0), 0);
5609 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5610 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5611 shift = -1;
5612 }
5613 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5614 else if ((GET_CODE (x) == SIGN_EXTRACT
5615 || GET_CODE (x) == ZERO_EXTRACT)
5616 && GET_MODE (x) == DImode
5617 && GET_CODE (XEXP (x, 0)) == ASHIFT
5618 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5619 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5620 {
5621 type = (GET_CODE (x) == SIGN_EXTRACT)
5622 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5623 index = XEXP (XEXP (x, 0), 0);
5624 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5625 if (INTVAL (XEXP (x, 1)) != 32 + shift
5626 || INTVAL (XEXP (x, 2)) != 0)
5627 shift = -1;
5628 }
5629 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5630 (const_int 0xffffffff<<shift)) */
5631 else if (GET_CODE (x) == AND
5632 && GET_MODE (x) == DImode
5633 && GET_CODE (XEXP (x, 0)) == ASHIFT
5634 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5635 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5636 && CONST_INT_P (XEXP (x, 1)))
5637 {
5638 type = ADDRESS_REG_UXTW;
5639 index = XEXP (XEXP (x, 0), 0);
5640 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5641 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5642 shift = -1;
5643 }
5644 /* (mult:P (reg:P) (const_int scale)) */
5645 else if (GET_CODE (x) == MULT
5646 && GET_MODE (x) == Pmode
5647 && GET_MODE (XEXP (x, 0)) == Pmode
5648 && CONST_INT_P (XEXP (x, 1)))
5649 {
5650 type = ADDRESS_REG_REG;
5651 index = XEXP (x, 0);
5652 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5653 }
5654 /* (ashift:P (reg:P) (const_int shift)) */
5655 else if (GET_CODE (x) == ASHIFT
5656 && GET_MODE (x) == Pmode
5657 && GET_MODE (XEXP (x, 0)) == Pmode
5658 && CONST_INT_P (XEXP (x, 1)))
5659 {
5660 type = ADDRESS_REG_REG;
5661 index = XEXP (x, 0);
5662 shift = INTVAL (XEXP (x, 1));
5663 }
5664 else
5665 return false;
5666
5667 if (!strict_p
5668 && GET_CODE (index) == SUBREG
5669 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5670 index = SUBREG_REG (index);
5671
5672 if (aarch64_sve_data_mode_p (mode))
5673 {
5674 if (type != ADDRESS_REG_REG
5675 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5676 return false;
5677 }
5678 else
5679 {
5680 if (shift != 0
5681 && !(IN_RANGE (shift, 1, 3)
5682 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5683 return false;
5684 }
5685
5686 if (REG_P (index)
5687 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5688 {
5689 info->type = type;
5690 info->offset = index;
5691 info->shift = shift;
5692 return true;
5693 }
5694
5695 return false;
5696 }
5697
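/* For example, for a DImode access the index expression
     (ashift:DI (sign_extend:DI (reg:SI <w2>)) (const_int 3))
   (register numbers purely illustrative) is classified as
   ADDRESS_REG_SXTW with shift 3, i.e. an address operand of the form
   [<xN>, <w2>, sxtw 3].  */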
5698 /* Return true if MODE is one of the modes for which we
5699 support LDP/STP operations. */
5700
5701 static bool
5702 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5703 {
5704 return mode == SImode || mode == DImode
5705 || mode == SFmode || mode == DFmode
5706 || (aarch64_vector_mode_supported_p (mode)
5707 && (known_eq (GET_MODE_SIZE (mode), 8)
5708 || (known_eq (GET_MODE_SIZE (mode), 16)
5709 && (aarch64_tune_params.extra_tuning_flags
5710 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5711 }
5712
5713 /* Return true if REGNO is a virtual pointer register, or an eliminable
5714 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5715 include stack_pointer or hard_frame_pointer. */
5716 static bool
5717 virt_or_elim_regno_p (unsigned regno)
5718 {
5719 return ((regno >= FIRST_VIRTUAL_REGISTER
5720 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5721 || regno == FRAME_POINTER_REGNUM
5722 || regno == ARG_POINTER_REGNUM);
5723 }
5724
5725 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5726 If it is, fill in INFO appropriately. STRICT_P is true if
5727 REG_OK_STRICT is in effect. */
5728
5729 static bool
5730 aarch64_classify_address (struct aarch64_address_info *info,
5731 rtx x, machine_mode mode, bool strict_p,
5732 aarch64_addr_query_type type = ADDR_QUERY_M)
5733 {
5734 enum rtx_code code = GET_CODE (x);
5735 rtx op0, op1;
5736 poly_int64 offset;
5737
5738 HOST_WIDE_INT const_size;
5739
5740 /* On BE, we use load/store pair for all large int mode load/stores.
5741 TI/TFmode may also use a load/store pair. */
5742 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5743 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5744 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5745 || mode == TImode
5746 || mode == TFmode
5747 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5748
5749 bool allow_reg_index_p = (!load_store_pair_p
5750 && (known_lt (GET_MODE_SIZE (mode), 16)
5751 || vec_flags == VEC_ADVSIMD
5752 || vec_flags == VEC_SVE_DATA));
5753
5754 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5755 [Rn, #offset, MUL VL]. */
5756 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5757 && (code != REG && code != PLUS))
5758 return false;
5759
5760 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5761 REG addressing. */
5762 if (advsimd_struct_p
5763 && !BYTES_BIG_ENDIAN
5764 && (code != POST_INC && code != REG))
5765 return false;
5766
5767 gcc_checking_assert (GET_MODE (x) == VOIDmode
5768 || SCALAR_INT_MODE_P (GET_MODE (x)));
5769
5770 switch (code)
5771 {
5772 case REG:
5773 case SUBREG:
5774 info->type = ADDRESS_REG_IMM;
5775 info->base = x;
5776 info->offset = const0_rtx;
5777 info->const_offset = 0;
5778 return aarch64_base_register_rtx_p (x, strict_p);
5779
5780 case PLUS:
5781 op0 = XEXP (x, 0);
5782 op1 = XEXP (x, 1);
5783
5784 if (! strict_p
5785 && REG_P (op0)
5786 && virt_or_elim_regno_p (REGNO (op0))
5787 && poly_int_rtx_p (op1, &offset))
5788 {
5789 info->type = ADDRESS_REG_IMM;
5790 info->base = op0;
5791 info->offset = op1;
5792 info->const_offset = offset;
5793
5794 return true;
5795 }
5796
5797 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5798 && aarch64_base_register_rtx_p (op0, strict_p)
5799 && poly_int_rtx_p (op1, &offset))
5800 {
5801 info->type = ADDRESS_REG_IMM;
5802 info->base = op0;
5803 info->offset = op1;
5804 info->const_offset = offset;
5805
5806 /* TImode and TFmode values are allowed in both pairs of X
5807 registers and individual Q registers. The available
5808 address modes are:
5809 X,X: 7-bit signed scaled offset
5810 Q: 9-bit signed offset
5811 We conservatively require an offset representable in either mode.
5812 When performing the check for pairs of X registers i.e. LDP/STP
5813 pass down DImode since that is the natural size of the LDP/STP
5814 instruction memory accesses. */
5815 if (mode == TImode || mode == TFmode)
5816 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5817 && (offset_9bit_signed_unscaled_p (mode, offset)
5818 || offset_12bit_unsigned_scaled_p (mode, offset)));
5819
5820 /* A 7-bit offset check because OImode will emit an ldp/stp
5821 instruction (only big-endian will get here).
5822 For ldp/stp instructions, the offset is scaled for the size of a
5823 single element of the pair. */
5824 if (mode == OImode)
5825 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5826
5827 /* Three 9/12-bit offset checks because CImode will emit three
5828 ldr/str instructions (only big-endian will get here). */
5829 if (mode == CImode)
5830 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5831 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5832 || offset_12bit_unsigned_scaled_p (V16QImode,
5833 offset + 32)));
5834
5835 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5836 instructions (only big-endian will get here). */
5837 if (mode == XImode)
5838 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5839 && aarch64_offset_7bit_signed_scaled_p (TImode,
5840 offset + 32));
5841
5842 /* Make "m" use the LD1 offset range for SVE data modes, so
5843 that pre-RTL optimizers like ivopts will work to that range
5844 instead of the wider LDR/STR range. */
5845 if (vec_flags == VEC_SVE_DATA)
5846 return (type == ADDR_QUERY_M
5847 ? offset_4bit_signed_scaled_p (mode, offset)
5848 : offset_9bit_signed_scaled_p (mode, offset));
5849
5850 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5851 {
5852 poly_int64 end_offset = (offset
5853 + GET_MODE_SIZE (mode)
5854 - BYTES_PER_SVE_VECTOR);
5855 return (type == ADDR_QUERY_M
5856 ? offset_4bit_signed_scaled_p (mode, offset)
5857 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5858 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5859 end_offset)));
5860 }
5861
5862 if (vec_flags == VEC_SVE_PRED)
5863 return offset_9bit_signed_scaled_p (mode, offset);
5864
5865 if (load_store_pair_p)
5866 return ((known_eq (GET_MODE_SIZE (mode), 4)
5867 || known_eq (GET_MODE_SIZE (mode), 8)
5868 || known_eq (GET_MODE_SIZE (mode), 16))
5869 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5870 else
5871 return (offset_9bit_signed_unscaled_p (mode, offset)
5872 || offset_12bit_unsigned_scaled_p (mode, offset));
5873 }
5874
5875 if (allow_reg_index_p)
5876 {
5877 /* Look for base + (scaled/extended) index register. */
5878 if (aarch64_base_register_rtx_p (op0, strict_p)
5879 && aarch64_classify_index (info, op1, mode, strict_p))
5880 {
5881 info->base = op0;
5882 return true;
5883 }
5884 if (aarch64_base_register_rtx_p (op1, strict_p)
5885 && aarch64_classify_index (info, op0, mode, strict_p))
5886 {
5887 info->base = op1;
5888 return true;
5889 }
5890 }
5891
5892 return false;
5893
5894 case POST_INC:
5895 case POST_DEC:
5896 case PRE_INC:
5897 case PRE_DEC:
5898 info->type = ADDRESS_REG_WB;
5899 info->base = XEXP (x, 0);
5900 info->offset = NULL_RTX;
5901 return aarch64_base_register_rtx_p (info->base, strict_p);
5902
5903 case POST_MODIFY:
5904 case PRE_MODIFY:
5905 info->type = ADDRESS_REG_WB;
5906 info->base = XEXP (x, 0);
5907 if (GET_CODE (XEXP (x, 1)) == PLUS
5908 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5909 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5910 && aarch64_base_register_rtx_p (info->base, strict_p))
5911 {
5912 info->offset = XEXP (XEXP (x, 1), 1);
5913 info->const_offset = offset;
5914
5915 /* TImode and TFmode values are allowed in both pairs of X
5916 registers and individual Q registers. The available
5917 address modes are:
5918 X,X: 7-bit signed scaled offset
5919 Q: 9-bit signed offset
5920 We conservatively require an offset representable in either mode.
5921 */
5922 if (mode == TImode || mode == TFmode)
5923 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5924 && offset_9bit_signed_unscaled_p (mode, offset));
5925
5926 if (load_store_pair_p)
5927 return ((known_eq (GET_MODE_SIZE (mode), 4)
5928 || known_eq (GET_MODE_SIZE (mode), 8)
5929 || known_eq (GET_MODE_SIZE (mode), 16))
5930 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5931 else
5932 return offset_9bit_signed_unscaled_p (mode, offset);
5933 }
5934 return false;
5935
5936 case CONST:
5937 case SYMBOL_REF:
5938 case LABEL_REF:
5939 /* Load literal: a pc-relative constant pool entry. Only supported
5940 for SImode or larger. */
5941 info->type = ADDRESS_SYMBOLIC;
5942
5943 if (!load_store_pair_p
5944 && GET_MODE_SIZE (mode).is_constant (&const_size)
5945 && const_size >= 4)
5946 {
5947 rtx sym, addend;
5948
5949 split_const (x, &sym, &addend);
5950 return ((GET_CODE (sym) == LABEL_REF
5951 || (GET_CODE (sym) == SYMBOL_REF
5952 && CONSTANT_POOL_ADDRESS_P (sym)
5953 && aarch64_pcrelative_literal_loads)));
5954 }
5955 return false;
5956
5957 case LO_SUM:
5958 info->type = ADDRESS_LO_SUM;
5959 info->base = XEXP (x, 0);
5960 info->offset = XEXP (x, 1);
5961 if (allow_reg_index_p
5962 && aarch64_base_register_rtx_p (info->base, strict_p))
5963 {
5964 rtx sym, offs;
5965 split_const (info->offset, &sym, &offs);
5966 if (GET_CODE (sym) == SYMBOL_REF
5967 && (aarch64_classify_symbol (sym, INTVAL (offs))
5968 == SYMBOL_SMALL_ABSOLUTE))
5969 {
5970 /* The symbol and offset must be aligned to the access size. */
5971 unsigned int align;
5972
5973 if (CONSTANT_POOL_ADDRESS_P (sym))
5974 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5975 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5976 {
5977 tree exp = SYMBOL_REF_DECL (sym);
5978 align = TYPE_ALIGN (TREE_TYPE (exp));
5979 align = aarch64_constant_alignment (exp, align);
5980 }
5981 else if (SYMBOL_REF_DECL (sym))
5982 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5983 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5984 && SYMBOL_REF_BLOCK (sym) != NULL)
5985 align = SYMBOL_REF_BLOCK (sym)->alignment;
5986 else
5987 align = BITS_PER_UNIT;
5988
5989 poly_int64 ref_size = GET_MODE_SIZE (mode);
5990 if (known_eq (ref_size, 0))
5991 ref_size = GET_MODE_SIZE (DImode);
5992
5993 return (multiple_p (INTVAL (offs), ref_size)
5994 && multiple_p (align / BITS_PER_UNIT, ref_size));
5995 }
5996 }
5997 return false;
5998
5999 default:
6000 return false;
6001 }
6002 }
6003
6004 /* Return true if the address X is valid for a PRFM instruction.
6005 STRICT_P is true if we should do strict checking with
6006 aarch64_classify_address. */
6007
6008 bool
6009 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6010 {
6011 struct aarch64_address_info addr;
6012
6013 /* PRFM accepts the same addresses as DImode... */
6014 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6015 if (!res)
6016 return false;
6017
6018 /* ... except writeback forms. */
6019 return addr.type != ADDRESS_REG_WB;
6020 }
6021
6022 bool
6023 aarch64_symbolic_address_p (rtx x)
6024 {
6025 rtx offset;
6026
6027 split_const (x, &x, &offset);
6028 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6029 }
6030
6031 /* Classify the base of symbolic expression X. */
6032
6033 enum aarch64_symbol_type
6034 aarch64_classify_symbolic_expression (rtx x)
6035 {
6036 rtx offset;
6037
6038 split_const (x, &x, &offset);
6039 return aarch64_classify_symbol (x, INTVAL (offset));
6040 }
6041
6042
6043 /* Return TRUE if X is a legitimate address for accessing memory in
6044 mode MODE. */
6045 static bool
6046 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6047 {
6048 struct aarch64_address_info addr;
6049
6050 return aarch64_classify_address (&addr, x, mode, strict_p);
6051 }
6052
6053 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6054 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6055 bool
6056 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6057 aarch64_addr_query_type type)
6058 {
6059 struct aarch64_address_info addr;
6060
6061 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6062 }
6063
6064 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6065
6066 static bool
6067 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6068 poly_int64 orig_offset,
6069 machine_mode mode)
6070 {
6071 HOST_WIDE_INT size;
6072 if (GET_MODE_SIZE (mode).is_constant (&size))
6073 {
6074 HOST_WIDE_INT const_offset, second_offset;
6075
6076 /* A general SVE offset is A * VQ + B. Remove the A component from
6077 coefficient 0 in order to get the constant B. */
6078 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6079
6080 /* Split an out-of-range address displacement into a base and
6081 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6082 range otherwise to increase opportunities for sharing the base
6083 address of different sizes. Unaligned accesses use the signed
6084 9-bit range, TImode/TFmode use the intersection of signed
6085 scaled 7-bit and signed 9-bit offset. */
6086 if (mode == TImode || mode == TFmode)
6087 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6088 else if ((const_offset & (size - 1)) != 0)
6089 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6090 else
6091 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6092
6093 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6094 return false;
6095
6096 /* Split the offset into second_offset and the rest. */
6097 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6098 *offset2 = gen_int_mode (second_offset, Pmode);
6099 return true;
6100 }
6101 else
6102 {
6103 /* Get the mode we should use as the basis of the range. For structure
6104 modes this is the mode of one vector. */
6105 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6106 machine_mode step_mode
6107 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6108
6109 /* Get the "mul vl" multiplier we'd like to use. */
6110 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6111 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6112 if (vec_flags & VEC_SVE_DATA)
6113 /* LDR supports a 9-bit range, but the move patterns for
6114 structure modes require all vectors to be in range of the
6115 same base. The simplest way of accommodating that while still
6116 promoting reuse of anchor points between different modes is
6117 to use an 8-bit range unconditionally. */
6118 vnum = ((vnum + 128) & 255) - 128;
6119 else
6120 /* Predicates are only handled singly, so we might as well use
6121 the full range. */
6122 vnum = ((vnum + 256) & 511) - 256;
6123 if (vnum == 0)
6124 return false;
6125
6126 /* Convert the "mul vl" multiplier into a byte offset. */
6127 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6128 if (known_eq (second_offset, orig_offset))
6129 return false;
6130
6131 /* Split the offset into second_offset and the rest. */
6132 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6133 *offset2 = gen_int_mode (second_offset, Pmode);
6134 return true;
6135 }
6136 }
6137
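/* Worked example: a DImode access at constant offset 0x10008 is aligned,
   so second_offset = 0x10008 & 0x3ffc = 0x8 and the address is split as
   (base + 0x10000) + 0x8, allowing the anchor base + 0x10000 to be
   shared with neighbouring accesses.  */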
6138 /* Return the binary representation of floating point constant VALUE in INTVAL.
6139 If the value cannot be converted, return false without setting INTVAL.
6140 The conversion is done in the mode of VALUE. */
6141 bool
6142 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6143 {
6144
6145 /* We make a general exception for 0. */
6146 if (aarch64_float_const_zero_rtx_p (value))
6147 {
6148 *intval = 0;
6149 return true;
6150 }
6151
6152 scalar_float_mode mode;
6153 if (GET_CODE (value) != CONST_DOUBLE
6154 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6155 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6156 /* Only support up to DF mode. */
6157 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6158 return false;
6159
6160 unsigned HOST_WIDE_INT ival = 0;
6161
6162 long res[2];
6163 real_to_target (res,
6164 CONST_DOUBLE_REAL_VALUE (value),
6165 REAL_MODE_FORMAT (mode));
6166
6167 if (mode == DFmode)
6168 {
6169 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6170 ival = zext_hwi (res[order], 32);
6171 ival |= (zext_hwi (res[1 - order], 32) << 32);
6172 }
6173 else
6174 ival = zext_hwi (res[0], 32);
6175
6176 *intval = ival;
6177 return true;
6178 }
6179
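/* For example, (const_double:DF 1.0) yields 0x3ff0000000000000 and
   (const_double:SF 1.0) yields 0x3f800000 in *INTVAL.  */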
6180 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6181 single MOV(+MOVK) followed by an FMOV. */
6182 bool
6183 aarch64_float_const_rtx_p (rtx x)
6184 {
6185 machine_mode mode = GET_MODE (x);
6186 if (mode == VOIDmode)
6187 return false;
6188
6189 /* Determine whether it's cheaper to materialize float constants as
6190 mov/movk pairs rather than ldr/adrp pairs. */
6191 unsigned HOST_WIDE_INT ival;
6192
6193 if (GET_CODE (x) == CONST_DOUBLE
6194 && SCALAR_FLOAT_MODE_P (mode)
6195 && aarch64_reinterpret_float_as_int (x, &ival))
6196 {
6197 scalar_int_mode imode = (mode == HFmode
6198 ? SImode
6199 : int_mode_for_mode (mode).require ());
6200 int num_instr = aarch64_internal_mov_immediate
6201 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6202 return num_instr < 3;
6203 }
6204
6205 return false;
6206 }
6207
6208 /* Return TRUE if rtx X is the immediate constant 0.0. */
6209 bool
6210 aarch64_float_const_zero_rtx_p (rtx x)
6211 {
6212 if (GET_MODE (x) == VOIDmode)
6213 return false;
6214
6215 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6216 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6217 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6218 }
6219
6220 /* Return TRUE if rtx X is an immediate constant that fits in a single
6221 MOVI operation. */
6222 bool
6223 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6224 {
6225 if (!TARGET_SIMD)
6226 return false;
6227
6228 machine_mode vmode;
6229 scalar_int_mode imode;
6230 unsigned HOST_WIDE_INT ival;
6231
6232 if (GET_CODE (x) == CONST_DOUBLE
6233 && SCALAR_FLOAT_MODE_P (mode))
6234 {
6235 if (!aarch64_reinterpret_float_as_int (x, &ival))
6236 return false;
6237
6238 /* We make a general exception for 0. */
6239 if (aarch64_float_const_zero_rtx_p (x))
6240 return true;
6241
6242 imode = int_mode_for_mode (mode).require ();
6243 }
6244 else if (GET_CODE (x) == CONST_INT
6245 && is_a <scalar_int_mode> (mode, &imode))
6246 ival = INTVAL (x);
6247 else
6248 return false;
6249
6250 /* Use a 64-bit container mode for everything except DI/DF mode, where we
6251 use a 128-bit vector mode. */
6252 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6253
6254 vmode = aarch64_simd_container_mode (imode, width);
6255 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6256
6257 return aarch64_simd_valid_immediate (v_op, NULL);
6258 }
6259
6260
6261 /* Return the fixed registers used for condition codes. */
6262
6263 static bool
6264 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6265 {
6266 *p1 = CC_REGNUM;
6267 *p2 = INVALID_REGNUM;
6268 return true;
6269 }
6270
6271 /* This function is used by the call expanders of the machine description.
6272 RESULT is the register in which the result is returned. It's NULL for
6273 "call" and "sibcall".
6274 MEM is the location of the function call.
6275 SIBCALL indicates whether this function call is a normal call or a sibling
6276 call; a different pattern is generated accordingly. */
6277
6278 void
6279 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6280 {
6281 rtx call, callee, tmp;
6282 rtvec vec;
6283 machine_mode mode;
6284
6285 gcc_assert (MEM_P (mem));
6286 callee = XEXP (mem, 0);
6287 mode = GET_MODE (callee);
6288 gcc_assert (mode == Pmode);
6289
6290 /* Decide if we should generate indirect calls by loading the
6291 address of the callee into a register before performing
6292 the branch-and-link. */
6293 if (SYMBOL_REF_P (callee)
6294 ? (aarch64_is_long_call_p (callee)
6295 || aarch64_is_noplt_call_p (callee))
6296 : !REG_P (callee))
6297 XEXP (mem, 0) = force_reg (mode, callee);
6298
6299 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6300
6301 if (result != NULL_RTX)
6302 call = gen_rtx_SET (result, call);
6303
6304 if (sibcall)
6305 tmp = ret_rtx;
6306 else
6307 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6308
6309 vec = gen_rtvec (2, call, tmp);
6310 call = gen_rtx_PARALLEL (VOIDmode, vec);
6311
6312 aarch64_emit_call_insn (call);
6313 }
6314
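/* Sketch of the RTL shape produced (not a literal dump): a plain call
   through a register becomes
     (parallel [(call (mem <callee-reg>) (const_int 0))
                (clobber (reg:DI LR))])
   a sibcall uses (return) in place of the LR clobber, and a call with a
   result wraps the CALL inside (set <result-reg> ...).  */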
6315 /* Emit call insn with PAT and do aarch64-specific handling. */
6316
6317 void
6318 aarch64_emit_call_insn (rtx pat)
6319 {
6320 rtx insn = emit_call_insn (pat);
6321
6322 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6323 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6324 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6325 }
6326
6327 machine_mode
6328 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6329 {
6330 /* All floating-point compares return CCFP if it is an equality
6331 comparison or an unordered-aware comparison, and CCFPE otherwise. */
6332 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6333 {
6334 switch (code)
6335 {
6336 case EQ:
6337 case NE:
6338 case UNORDERED:
6339 case ORDERED:
6340 case UNLT:
6341 case UNLE:
6342 case UNGT:
6343 case UNGE:
6344 case UNEQ:
6345 return CCFPmode;
6346
6347 case LT:
6348 case LE:
6349 case GT:
6350 case GE:
6351 case LTGT:
6352 return CCFPEmode;
6353
6354 default:
6355 gcc_unreachable ();
6356 }
6357 }
6358
6359 /* Equality comparisons of short modes against zero can be performed
6360 using the TST instruction with the appropriate bitmask. */
6361 if (y == const0_rtx && REG_P (x)
6362 && (code == EQ || code == NE)
6363 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6364 return CC_NZmode;
6365
6366 /* Similarly, comparisons of zero_extends from shorter modes can
6367 be performed using an ANDS with an immediate mask. */
6368 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6369 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6370 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6371 && (code == EQ || code == NE))
6372 return CC_NZmode;
6373
6374 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6375 && y == const0_rtx
6376 && (code == EQ || code == NE || code == LT || code == GE)
6377 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6378 || GET_CODE (x) == NEG
6379 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6380 && CONST_INT_P (XEXP (x, 2)))))
6381 return CC_NZmode;
6382
6383 /* A compare with a shifted operand. Because of canonicalization,
6384 the comparison will have to be swapped when we emit the assembly
6385 code. */
6386 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6387 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6388 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6389 || GET_CODE (x) == LSHIFTRT
6390 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6391 return CC_SWPmode;
6392
6393 /* Similarly for a negated operand, but we can only do this for
6394 equalities. */
6395 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6396 && (REG_P (y) || GET_CODE (y) == SUBREG)
6397 && (code == EQ || code == NE)
6398 && GET_CODE (x) == NEG)
6399 return CC_Zmode;
6400
6401 /* A test for unsigned overflow. */
6402 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6403 && code == NE
6404 && GET_CODE (x) == PLUS
6405 && GET_CODE (y) == ZERO_EXTEND)
6406 return CC_Cmode;
6407
6408 /* For everything else, return CCmode. */
6409 return CCmode;
6410 }
6411
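/* For example, comparing (ashift:DI (reg:DI <x>) (const_int 2)) against
   a register yields CC_SWPmode, because the shifted operand must become
   the second operand of the emitted CMP, while comparing a HImode
   register against zero for EQ/NE yields CC_NZmode so that a TST with
   mask 0xffff can be used.  */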
6412 static int
6413 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6414
6415 int
6416 aarch64_get_condition_code (rtx x)
6417 {
6418 machine_mode mode = GET_MODE (XEXP (x, 0));
6419 enum rtx_code comp_code = GET_CODE (x);
6420
6421 if (GET_MODE_CLASS (mode) != MODE_CC)
6422 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6423 return aarch64_get_condition_code_1 (mode, comp_code);
6424 }
6425
6426 static int
6427 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6428 {
6429 switch (mode)
6430 {
6431 case E_CCFPmode:
6432 case E_CCFPEmode:
6433 switch (comp_code)
6434 {
6435 case GE: return AARCH64_GE;
6436 case GT: return AARCH64_GT;
6437 case LE: return AARCH64_LS;
6438 case LT: return AARCH64_MI;
6439 case NE: return AARCH64_NE;
6440 case EQ: return AARCH64_EQ;
6441 case ORDERED: return AARCH64_VC;
6442 case UNORDERED: return AARCH64_VS;
6443 case UNLT: return AARCH64_LT;
6444 case UNLE: return AARCH64_LE;
6445 case UNGT: return AARCH64_HI;
6446 case UNGE: return AARCH64_PL;
6447 default: return -1;
6448 }
6449 break;
6450
6451 case E_CCmode:
6452 switch (comp_code)
6453 {
6454 case NE: return AARCH64_NE;
6455 case EQ: return AARCH64_EQ;
6456 case GE: return AARCH64_GE;
6457 case GT: return AARCH64_GT;
6458 case LE: return AARCH64_LE;
6459 case LT: return AARCH64_LT;
6460 case GEU: return AARCH64_CS;
6461 case GTU: return AARCH64_HI;
6462 case LEU: return AARCH64_LS;
6463 case LTU: return AARCH64_CC;
6464 default: return -1;
6465 }
6466 break;
6467
6468 case E_CC_SWPmode:
6469 switch (comp_code)
6470 {
6471 case NE: return AARCH64_NE;
6472 case EQ: return AARCH64_EQ;
6473 case GE: return AARCH64_LE;
6474 case GT: return AARCH64_LT;
6475 case LE: return AARCH64_GE;
6476 case LT: return AARCH64_GT;
6477 case GEU: return AARCH64_LS;
6478 case GTU: return AARCH64_CC;
6479 case LEU: return AARCH64_CS;
6480 case LTU: return AARCH64_HI;
6481 default: return -1;
6482 }
6483 break;
6484
6485 case E_CC_NZmode:
6486 switch (comp_code)
6487 {
6488 case NE: return AARCH64_NE;
6489 case EQ: return AARCH64_EQ;
6490 case GE: return AARCH64_PL;
6491 case LT: return AARCH64_MI;
6492 default: return -1;
6493 }
6494 break;
6495
6496 case E_CC_Zmode:
6497 switch (comp_code)
6498 {
6499 case NE: return AARCH64_NE;
6500 case EQ: return AARCH64_EQ;
6501 default: return -1;
6502 }
6503 break;
6504
6505 case E_CC_Cmode:
6506 switch (comp_code)
6507 {
6508 case NE: return AARCH64_CS;
6509 case EQ: return AARCH64_CC;
6510 default: return -1;
6511 }
6512 break;
6513
6514 default:
6515 return -1;
6516 }
6517
6518 return -1;
6519 }
6520
6521 bool
6522 aarch64_const_vec_all_same_in_range_p (rtx x,
6523 HOST_WIDE_INT minval,
6524 HOST_WIDE_INT maxval)
6525 {
6526 rtx elt;
6527 return (const_vec_duplicate_p (x, &elt)
6528 && CONST_INT_P (elt)
6529 && IN_RANGE (INTVAL (elt), minval, maxval));
6530 }
6531
6532 bool
6533 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6534 {
6535 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6536 }
6537
6538 /* Return true if VEC is a constant in which every element is in the range
6539 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6540
6541 static bool
6542 aarch64_const_vec_all_in_range_p (rtx vec,
6543 HOST_WIDE_INT minval,
6544 HOST_WIDE_INT maxval)
6545 {
6546 if (GET_CODE (vec) != CONST_VECTOR
6547 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6548 return false;
6549
6550 int nunits;
6551 if (!CONST_VECTOR_STEPPED_P (vec))
6552 nunits = const_vector_encoded_nelts (vec);
6553 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6554 return false;
6555
6556 for (int i = 0; i < nunits; i++)
6557 {
6558 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6559 if (!CONST_INT_P (vec_elem)
6560 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6561 return false;
6562 }
6563 return true;
6564 }
6565
6566 /* N Z C V. */
6567 #define AARCH64_CC_V 1
6568 #define AARCH64_CC_C (1 << 1)
6569 #define AARCH64_CC_Z (1 << 2)
6570 #define AARCH64_CC_N (1 << 3)
6571
6572 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6573 static const int aarch64_nzcv_codes[] =
6574 {
6575 0, /* EQ, Z == 1. */
6576 AARCH64_CC_Z, /* NE, Z == 0. */
6577 0, /* CS, C == 1. */
6578 AARCH64_CC_C, /* CC, C == 0. */
6579 0, /* MI, N == 1. */
6580 AARCH64_CC_N, /* PL, N == 0. */
6581 0, /* VS, V == 1. */
6582 AARCH64_CC_V, /* VC, V == 0. */
6583 0, /* HI, C == 1 && Z == 0. */
6584 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6585 AARCH64_CC_V, /* GE, N == V. */
6586 0, /* LT, N != V. */
6587 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6588 0, /* LE, !(Z == 0 && N == V). */
6589 0, /* AL, Any. */
6590 0 /* NV, Any. */
6591 };
6592
6593 /* Print floating-point vector immediate operand X to F, negating it
6594 first if NEGATE is true. Return true on success, false if it isn't
6595 a constant we can handle. */
6596
6597 static bool
6598 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6599 {
6600 rtx elt;
6601
6602 if (!const_vec_duplicate_p (x, &elt))
6603 return false;
6604
6605 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6606 if (negate)
6607 r = real_value_negate (&r);
6608
6609 /* We only handle the SVE single-bit immediates here. */
6610 if (real_equal (&r, &dconst0))
6611 asm_fprintf (f, "0.0");
6612 else if (real_equal (&r, &dconst1))
6613 asm_fprintf (f, "1.0");
6614 else if (real_equal (&r, &dconsthalf))
6615 asm_fprintf (f, "0.5");
6616 else
6617 return false;
6618
6619 return true;
6620 }
6621
6622 /* Return the equivalent letter for size. */
6623 static char
6624 sizetochar (int size)
6625 {
6626 switch (size)
6627 {
6628 case 64: return 'd';
6629 case 32: return 's';
6630 case 16: return 'h';
6631 case 8 : return 'b';
6632 default: gcc_unreachable ();
6633 }
6634 }
6635
6636 /* Print operand X to file F in a target specific manner according to CODE.
6637 The acceptable formatting commands given by CODE are:
6638 'c': An integer or symbol address without a preceding #
6639 sign.
6640 'C': Take the duplicated element in a vector constant
6641 and print it in hex.
6642 'D': Take the duplicated element in a vector constant
6643 and print it as an unsigned integer, in decimal.
6644 'e': Print the sign/zero-extend size as a character 8->b,
6645 16->h, 32->w.
6646 'p': Prints N such that 2^N == X (X must be power of 2 and
6647 const int).
6648 'P': Print the number of non-zero bits in X (a const_int).
6649 'H': Print the higher numbered register of a pair (TImode)
6650 of regs.
6651 'm': Print a condition (eq, ne, etc).
6652 'M': Same as 'm', but invert condition.
6653 'N': Take the duplicated element in a vector constant
6654 and print the negative of it in decimal.
6655 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6656 'S/T/U/V': Print a FP/SIMD register name for a register list.
6657 The register printed is the FP/SIMD register name
6658 of X + 0/1/2/3 for S/T/U/V.
6659 'R': Print a scalar FP/SIMD register name + 1.
6660 'X': Print bottom 16 bits of integer constant in hex.
6661 'w/x': Print a general register name or the zero register
6662 (32-bit or 64-bit).
6663 '0': Print a normal operand, if it's a general register,
6664 then we assume DImode.
6665 'k': Print NZCV for conditional compare instructions.
6666 'A': Output address constant representing the first
6667 argument of X, specifying a relocation offset
6668 if appropriate.
6669 'L': Output constant address specified by X
6670 with a relocation offset if appropriate.
6671 'G': Prints address of X, specifying a PC relative
6672 relocation mode if appropriate.
6673 'y': Output address of LDP or STP - this is used for
6674 some LDP/STPs which don't use a PARALLEL in their
6675 pattern (so the mode needs to be adjusted).
6676 'z': Output address of a typical LDP or STP. */
6677
6678 static void
6679 aarch64_print_operand (FILE *f, rtx x, int code)
6680 {
6681 rtx elt;
6682 switch (code)
6683 {
6684 case 'c':
6685 switch (GET_CODE (x))
6686 {
6687 case CONST_INT:
6688 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6689 break;
6690
6691 case SYMBOL_REF:
6692 output_addr_const (f, x);
6693 break;
6694
6695 case CONST:
6696 if (GET_CODE (XEXP (x, 0)) == PLUS
6697 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6698 {
6699 output_addr_const (f, x);
6700 break;
6701 }
6702 /* Fall through. */
6703
6704 default:
6705 output_operand_lossage ("unsupported operand for code '%c'", code);
6706 }
6707 break;
6708
6709 case 'e':
6710 {
6711 int n;
6712
6713 if (!CONST_INT_P (x)
6714 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6715 {
6716 output_operand_lossage ("invalid operand for '%%%c'", code);
6717 return;
6718 }
6719
6720 switch (n)
6721 {
6722 case 3:
6723 fputc ('b', f);
6724 break;
6725 case 4:
6726 fputc ('h', f);
6727 break;
6728 case 5:
6729 fputc ('w', f);
6730 break;
6731 default:
6732 output_operand_lossage ("invalid operand for '%%%c'", code);
6733 return;
6734 }
6735 }
6736 break;
6737
6738 case 'p':
6739 {
6740 int n;
6741
6742 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6743 {
6744 output_operand_lossage ("invalid operand for '%%%c'", code);
6745 return;
6746 }
6747
6748 asm_fprintf (f, "%d", n);
6749 }
6750 break;
6751
6752 case 'P':
6753 if (!CONST_INT_P (x))
6754 {
6755 output_operand_lossage ("invalid operand for '%%%c'", code);
6756 return;
6757 }
6758
6759 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6760 break;
6761
6762 case 'H':
6763 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6764 {
6765 output_operand_lossage ("invalid operand for '%%%c'", code);
6766 return;
6767 }
6768
6769 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6770 break;
6771
6772 case 'M':
6773 case 'm':
6774 {
6775 int cond_code;
6776 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6777 if (x == const_true_rtx)
6778 {
6779 if (code == 'M')
6780 fputs ("nv", f);
6781 return;
6782 }
6783
6784 if (!COMPARISON_P (x))
6785 {
6786 output_operand_lossage ("invalid operand for '%%%c'", code);
6787 return;
6788 }
6789
6790 cond_code = aarch64_get_condition_code (x);
6791 gcc_assert (cond_code >= 0);
6792 if (code == 'M')
6793 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6794 fputs (aarch64_condition_codes[cond_code], f);
6795 }
6796 break;
6797
6798 case 'N':
6799 if (!const_vec_duplicate_p (x, &elt))
6800 {
6801 output_operand_lossage ("invalid vector constant");
6802 return;
6803 }
6804
6805 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6806 asm_fprintf (f, "%wd", -INTVAL (elt));
6807 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6808 && aarch64_print_vector_float_operand (f, x, true))
6809 ;
6810 else
6811 {
6812 output_operand_lossage ("invalid vector constant");
6813 return;
6814 }
6815 break;
6816
6817 case 'b':
6818 case 'h':
6819 case 's':
6820 case 'd':
6821 case 'q':
6822 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6823 {
6824 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6825 return;
6826 }
6827 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6828 break;
6829
6830 case 'S':
6831 case 'T':
6832 case 'U':
6833 case 'V':
6834 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6835 {
6836 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6837 return;
6838 }
6839 asm_fprintf (f, "%c%d",
6840 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6841 REGNO (x) - V0_REGNUM + (code - 'S'));
6842 break;
6843
6844 case 'R':
6845 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6846 {
6847 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6848 return;
6849 }
6850 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6851 break;
6852
6853 case 'X':
6854 if (!CONST_INT_P (x))
6855 {
6856 output_operand_lossage ("invalid operand for '%%%c'", code);
6857 return;
6858 }
6859 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6860 break;
6861
6862 case 'C':
6863 {
6864 /* Print a replicated constant in hex. */
6865 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6866 {
6867 output_operand_lossage ("invalid operand for '%%%c'", code);
6868 return;
6869 }
6870 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6871 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6872 }
6873 break;
6874
6875 case 'D':
6876 {
6877 /* Print a replicated constant in decimal, treating it as
6878 unsigned. */
6879 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6880 {
6881 output_operand_lossage ("invalid operand for '%%%c'", code);
6882 return;
6883 }
6884 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6885 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6886 }
6887 break;
6888
6889 case 'w':
6890 case 'x':
6891 if (x == const0_rtx
6892 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6893 {
6894 asm_fprintf (f, "%czr", code);
6895 break;
6896 }
6897
6898 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6899 {
6900 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6901 break;
6902 }
6903
6904 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6905 {
6906 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6907 break;
6908 }
6909
6910 /* Fall through */
6911
6912 case 0:
6913 if (x == NULL)
6914 {
6915 output_operand_lossage ("missing operand");
6916 return;
6917 }
6918
6919 switch (GET_CODE (x))
6920 {
6921 case REG:
6922 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6923 {
6924 if (REG_NREGS (x) == 1)
6925 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6926 else
6927 {
6928 char suffix
6929 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6930 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6931 REGNO (x) - V0_REGNUM, suffix,
6932 END_REGNO (x) - V0_REGNUM - 1, suffix);
6933 }
6934 }
6935 else
6936 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6937 break;
6938
6939 case MEM:
6940 output_address (GET_MODE (x), XEXP (x, 0));
6941 break;
6942
6943 case LABEL_REF:
6944 case SYMBOL_REF:
6945 output_addr_const (asm_out_file, x);
6946 break;
6947
6948 case CONST_INT:
6949 asm_fprintf (f, "%wd", INTVAL (x));
6950 break;
6951
6952 case CONST:
6953 if (!VECTOR_MODE_P (GET_MODE (x)))
6954 {
6955 output_addr_const (asm_out_file, x);
6956 break;
6957 }
6958 /* fall through */
6959
6960 case CONST_VECTOR:
6961 if (!const_vec_duplicate_p (x, &elt))
6962 {
6963 output_operand_lossage ("invalid vector constant");
6964 return;
6965 }
6966
6967 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6968 asm_fprintf (f, "%wd", INTVAL (elt));
6969 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6970 && aarch64_print_vector_float_operand (f, x, false))
6971 ;
6972 else
6973 {
6974 output_operand_lossage ("invalid vector constant");
6975 return;
6976 }
6977 break;
6978
6979 case CONST_DOUBLE:
6980 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6981 be getting CONST_DOUBLEs holding integers. */
6982 gcc_assert (GET_MODE (x) != VOIDmode);
6983 if (aarch64_float_const_zero_rtx_p (x))
6984 {
6985 fputc ('0', f);
6986 break;
6987 }
6988 else if (aarch64_float_const_representable_p (x))
6989 {
6990 #define buf_size 20
6991 char float_buf[buf_size] = {'\0'};
6992 real_to_decimal_for_mode (float_buf,
6993 CONST_DOUBLE_REAL_VALUE (x),
6994 buf_size, buf_size,
6995 1, GET_MODE (x));
6996 asm_fprintf (asm_out_file, "%s", float_buf);
6997 break;
6998 #undef buf_size
6999 }
7000 output_operand_lossage ("invalid constant");
7001 return;
7002 default:
7003 output_operand_lossage ("invalid operand");
7004 return;
7005 }
7006 break;
7007
7008 case 'A':
7009 if (GET_CODE (x) == HIGH)
7010 x = XEXP (x, 0);
7011
7012 switch (aarch64_classify_symbolic_expression (x))
7013 {
7014 case SYMBOL_SMALL_GOT_4G:
7015 asm_fprintf (asm_out_file, ":got:");
7016 break;
7017
7018 case SYMBOL_SMALL_TLSGD:
7019 asm_fprintf (asm_out_file, ":tlsgd:");
7020 break;
7021
7022 case SYMBOL_SMALL_TLSDESC:
7023 asm_fprintf (asm_out_file, ":tlsdesc:");
7024 break;
7025
7026 case SYMBOL_SMALL_TLSIE:
7027 asm_fprintf (asm_out_file, ":gottprel:");
7028 break;
7029
7030 case SYMBOL_TLSLE24:
7031 asm_fprintf (asm_out_file, ":tprel:");
7032 break;
7033
7034 case SYMBOL_TINY_GOT:
7035 gcc_unreachable ();
7036 break;
7037
7038 default:
7039 break;
7040 }
7041 output_addr_const (asm_out_file, x);
7042 break;
7043
7044 case 'L':
7045 switch (aarch64_classify_symbolic_expression (x))
7046 {
7047 case SYMBOL_SMALL_GOT_4G:
7048 asm_fprintf (asm_out_file, ":lo12:");
7049 break;
7050
7051 case SYMBOL_SMALL_TLSGD:
7052 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7053 break;
7054
7055 case SYMBOL_SMALL_TLSDESC:
7056 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7057 break;
7058
7059 case SYMBOL_SMALL_TLSIE:
7060 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7061 break;
7062
7063 case SYMBOL_TLSLE12:
7064 asm_fprintf (asm_out_file, ":tprel_lo12:");
7065 break;
7066
7067 case SYMBOL_TLSLE24:
7068 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7069 break;
7070
7071 case SYMBOL_TINY_GOT:
7072 asm_fprintf (asm_out_file, ":got:");
7073 break;
7074
7075 case SYMBOL_TINY_TLSIE:
7076 asm_fprintf (asm_out_file, ":gottprel:");
7077 break;
7078
7079 default:
7080 break;
7081 }
7082 output_addr_const (asm_out_file, x);
7083 break;
7084
7085 case 'G':
7086 switch (aarch64_classify_symbolic_expression (x))
7087 {
7088 case SYMBOL_TLSLE24:
7089 asm_fprintf (asm_out_file, ":tprel_hi12:");
7090 break;
7091 default:
7092 break;
7093 }
7094 output_addr_const (asm_out_file, x);
7095 break;
7096
7097 case 'k':
7098 {
7099 HOST_WIDE_INT cond_code;
7100
7101 if (!CONST_INT_P (x))
7102 {
7103 output_operand_lossage ("invalid operand for '%%%c'", code);
7104 return;
7105 }
7106
7107 cond_code = INTVAL (x);
7108 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7109 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7110 }
7111 break;
7112
7113 case 'y':
7114 case 'z':
7115 {
7116 machine_mode mode = GET_MODE (x);
7117
7118 if (GET_CODE (x) != MEM
7119 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7120 {
7121 output_operand_lossage ("invalid operand for '%%%c'", code);
7122 return;
7123 }
7124
7125 if (code == 'y')
7126 /* LDP/STP which uses a single double-width memory operand.
7127 Adjust the mode to appear like a typical LDP/STP.
7128 Currently this is supported for 16-byte accesses only. */
7129 mode = DFmode;
7130
7131 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7132 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7133 }
7134 break;
7135
7136 default:
7137 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7138 return;
7139 }
7140 }
7141
7142 /* Print address 'x' of a memory access with mode 'mode'.
7143 'op' is the context required by aarch64_classify_address. It can either be
7144 MEM for a normal memory access or PARALLEL for LDP/STP. */
7145 static bool
7146 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7147 aarch64_addr_query_type type)
7148 {
7149 struct aarch64_address_info addr;
7150 unsigned int size;
7151
7152 /* Check all addresses are Pmode - including ILP32. */
7153 if (GET_MODE (x) != Pmode)
7154 output_operand_lossage ("invalid address mode");
7155
7156 if (aarch64_classify_address (&addr, x, mode, true, type))
7157 switch (addr.type)
7158 {
7159 case ADDRESS_REG_IMM:
7160 if (known_eq (addr.const_offset, 0))
7161 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7162 else if (aarch64_sve_data_mode_p (mode))
7163 {
7164 HOST_WIDE_INT vnum
7165 = exact_div (addr.const_offset,
7166 BYTES_PER_SVE_VECTOR).to_constant ();
7167 asm_fprintf (f, "[%s, #%wd, mul vl]",
7168 reg_names[REGNO (addr.base)], vnum);
7169 }
7170 else if (aarch64_sve_pred_mode_p (mode))
7171 {
7172 HOST_WIDE_INT vnum
7173 = exact_div (addr.const_offset,
7174 BYTES_PER_SVE_PRED).to_constant ();
7175 asm_fprintf (f, "[%s, #%wd, mul vl]",
7176 reg_names[REGNO (addr.base)], vnum);
7177 }
7178 else
7179 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7180 INTVAL (addr.offset));
7181 return true;
7182
7183 case ADDRESS_REG_REG:
7184 if (addr.shift == 0)
7185 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7186 reg_names [REGNO (addr.offset)]);
7187 else
7188 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7189 reg_names [REGNO (addr.offset)], addr.shift);
7190 return true;
7191
7192 case ADDRESS_REG_UXTW:
7193 if (addr.shift == 0)
7194 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7195 REGNO (addr.offset) - R0_REGNUM);
7196 else
7197 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7198 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7199 return true;
7200
7201 case ADDRESS_REG_SXTW:
7202 if (addr.shift == 0)
7203 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7204 REGNO (addr.offset) - R0_REGNUM);
7205 else
7206 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7207 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7208 return true;
7209
7210 case ADDRESS_REG_WB:
7211 /* Writeback is only supported for fixed-width modes. */
7212 size = GET_MODE_SIZE (mode).to_constant ();
7213 switch (GET_CODE (x))
7214 {
7215 case PRE_INC:
7216 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7217 return true;
7218 case POST_INC:
7219 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7220 return true;
7221 case PRE_DEC:
7222 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7223 return true;
7224 case POST_DEC:
7225 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7226 return true;
7227 case PRE_MODIFY:
7228 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7229 INTVAL (addr.offset));
7230 return true;
7231 case POST_MODIFY:
7232 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7233 INTVAL (addr.offset));
7234 return true;
7235 default:
7236 break;
7237 }
7238 break;
7239
7240 case ADDRESS_LO_SUM:
7241 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7242 output_addr_const (f, addr.offset);
7243 asm_fprintf (f, "]");
7244 return true;
7245
7246 case ADDRESS_SYMBOLIC:
7247 output_addr_const (f, x);
7248 return true;
7249 }
7250
7251 return false;
7252 }
7253
7254 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7255 static bool
7256 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7257 {
7258 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7259 }
7260
7261 /* Print address 'x' of a memory access with mode 'mode'. */
7262 static void
7263 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7264 {
7265 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7266 output_addr_const (f, x);
7267 }
7268
7269 bool
7270 aarch64_label_mentioned_p (rtx x)
7271 {
7272 const char *fmt;
7273 int i;
7274
7275 if (GET_CODE (x) == LABEL_REF)
7276 return true;
7277
7278 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7279 referencing instruction, but they are constant offsets, not
7280 symbols. */
7281 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7282 return false;
7283
7284 fmt = GET_RTX_FORMAT (GET_CODE (x));
7285 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7286 {
7287 if (fmt[i] == 'E')
7288 {
7289 int j;
7290
7291 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7292 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7293 return 1;
7294 }
7295 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7296 return 1;
7297 }
7298
7299 return 0;
7300 }
7301
7302 /* Implement REGNO_REG_CLASS. */
7303
7304 enum reg_class
7305 aarch64_regno_regclass (unsigned regno)
7306 {
7307 if (GP_REGNUM_P (regno))
7308 return GENERAL_REGS;
7309
7310 if (regno == SP_REGNUM)
7311 return STACK_REG;
7312
7313 if (regno == FRAME_POINTER_REGNUM
7314 || regno == ARG_POINTER_REGNUM)
7315 return POINTER_REGS;
7316
7317 if (FP_REGNUM_P (regno))
7318 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7319
7320 if (PR_REGNUM_P (regno))
7321 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7322
7323 return NO_REGS;
7324 }
7325
7326 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7327 If OFFSET is out of range, return an offset of an anchor point
7328 that is in range. Return 0 otherwise. */
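/* For example (illustrative values): for an SImode access at offset
   0x1003 the offset is not a multiple of the access size, so the
   anchor returned is (0x1003 + 0x100) & ~0x1ff == 0x1000, leaving a
   small residual offset of 3 for the access itself.  */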
7329
7330 static HOST_WIDE_INT
7331 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7332 machine_mode mode)
7333 {
7334 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7335 if (size > 16)
7336 return (offset + 0x400) & ~0x7f0;
7337
7338 /* For offsets that aren't a multiple of the access size, the limit is
7339 -256...255. */
7340 if (offset & (size - 1))
7341 {
7342 /* BLKmode typically uses LDP of X-registers. */
7343 if (mode == BLKmode)
7344 return (offset + 512) & ~0x3ff;
7345 return (offset + 0x100) & ~0x1ff;
7346 }
7347
7348 /* Small negative offsets are supported. */
7349 if (IN_RANGE (offset, -256, 0))
7350 return 0;
7351
7352 if (mode == TImode || mode == TFmode)
7353 return (offset + 0x100) & ~0x1ff;
7354
7355 /* Use an unsigned 12-bit offset scaled by the access size. */
7356 return offset & (~0xfff * size);
7357 }
7358
7359 static rtx
7360 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7361 {
7362 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7363 where mask is selected by alignment and size of the offset.
7364 We try to pick as large a range for the offset as possible to
7365 maximize the chance of a CSE. However, for aligned addresses
7366 we limit the range to 4k so that structures with different sized
7367 elements are likely to use the same base. We need to be careful
7368 not to split a CONST for some forms of address expression, otherwise
7369 it will generate sub-optimal code. */
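/* For example (illustrative constants): for an SImode access to
   (plus (reg X) (const_int 0x12004)) the anchor offset is
   0x12004 & (~0xfff * 4) == 0x10000, so we emit Y = X + 0x10000 and
   rewrite the address as Y + 0x2004, which fits the scaled unsigned
   12-bit immediate range for 4-byte accesses.  */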
7370
7371 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7372 {
7373 rtx base = XEXP (x, 0);
7374 rtx offset_rtx = XEXP (x, 1);
7375 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7376
7377 if (GET_CODE (base) == PLUS)
7378 {
7379 rtx op0 = XEXP (base, 0);
7380 rtx op1 = XEXP (base, 1);
7381
7382 /* Force any scaling into a temp for CSE. */
7383 op0 = force_reg (Pmode, op0);
7384 op1 = force_reg (Pmode, op1);
7385
7386 /* Let the pointer register be in op0. */
7387 if (REG_POINTER (op1))
7388 std::swap (op0, op1);
7389
7390 /* If the pointer is virtual or frame related, then we know that
7391 virtual register instantiation or register elimination is going
7392 to apply a second constant. We want the two constants folded
7393 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7394 if (virt_or_elim_regno_p (REGNO (op0)))
7395 {
7396 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7397 NULL_RTX, true, OPTAB_DIRECT);
7398 return gen_rtx_PLUS (Pmode, base, op1);
7399 }
7400
7401 /* Otherwise, in order to encourage CSE (and thence loop strength
7402 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
7403 base = expand_binop (Pmode, add_optab, op0, op1,
7404 NULL_RTX, true, OPTAB_DIRECT);
7405 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7406 }
7407
7408 HOST_WIDE_INT size;
7409 if (GET_MODE_SIZE (mode).is_constant (&size))
7410 {
7411 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7412 mode);
7413 if (base_offset != 0)
7414 {
7415 base = plus_constant (Pmode, base, base_offset);
7416 base = force_operand (base, NULL_RTX);
7417 return plus_constant (Pmode, base, offset - base_offset);
7418 }
7419 }
7420 }
7421
7422 return x;
7423 }
7424
7425 /* Return the reload icode required for a constant pool in mode. */
7426 static enum insn_code
7427 aarch64_constant_pool_reload_icode (machine_mode mode)
7428 {
7429 switch (mode)
7430 {
7431 case E_SFmode:
7432 return CODE_FOR_aarch64_reload_movcpsfdi;
7433
7434 case E_DFmode:
7435 return CODE_FOR_aarch64_reload_movcpdfdi;
7436
7437 case E_TFmode:
7438 return CODE_FOR_aarch64_reload_movcptfdi;
7439
7440 case E_V8QImode:
7441 return CODE_FOR_aarch64_reload_movcpv8qidi;
7442
7443 case E_V16QImode:
7444 return CODE_FOR_aarch64_reload_movcpv16qidi;
7445
7446 case E_V4HImode:
7447 return CODE_FOR_aarch64_reload_movcpv4hidi;
7448
7449 case E_V8HImode:
7450 return CODE_FOR_aarch64_reload_movcpv8hidi;
7451
7452 case E_V2SImode:
7453 return CODE_FOR_aarch64_reload_movcpv2sidi;
7454
7455 case E_V4SImode:
7456 return CODE_FOR_aarch64_reload_movcpv4sidi;
7457
7458 case E_V2DImode:
7459 return CODE_FOR_aarch64_reload_movcpv2didi;
7460
7461 case E_V2DFmode:
7462 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7463
7464 default:
7465 gcc_unreachable ();
7466 }
7467
7468 gcc_unreachable ();
7469 }
7470 static reg_class_t
7471 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7472 reg_class_t rclass,
7473 machine_mode mode,
7474 secondary_reload_info *sri)
7475 {
7476 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7477 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7478 comment at the head of aarch64-sve.md for more details about the
7479 big-endian handling. */
7480 if (BYTES_BIG_ENDIAN
7481 && reg_class_subset_p (rclass, FP_REGS)
7482 && !((REG_P (x) && HARD_REGISTER_P (x))
7483 || aarch64_simd_valid_immediate (x, NULL))
7484 && aarch64_sve_data_mode_p (mode))
7485 {
7486 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7487 return NO_REGS;
7488 }
7489
7490 /* If we have to disable direct literal pool loads and stores because the
7491 function is too big, then we need a scratch register. */
7492 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7493 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7494 || targetm.vector_mode_supported_p (GET_MODE (x)))
7495 && !aarch64_pcrelative_literal_loads)
7496 {
7497 sri->icode = aarch64_constant_pool_reload_icode (mode);
7498 return NO_REGS;
7499 }
7500
7501 /* Without the TARGET_SIMD instructions we cannot move a Q register
7502 to a Q register directly. We need a scratch. */
7503 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7504 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7505 && reg_class_subset_p (rclass, FP_REGS))
7506 {
7507 if (mode == TFmode)
7508 sri->icode = CODE_FOR_aarch64_reload_movtf;
7509 else if (mode == TImode)
7510 sri->icode = CODE_FOR_aarch64_reload_movti;
7511 return NO_REGS;
7512 }
7513
7514 /* A TFmode or TImode memory access should be handled via an FP register,
7515 because AArch64 has richer addressing modes for LDR/STR instructions
7516 than for LDP/STP instructions. */
7517 if (TARGET_FLOAT && rclass == GENERAL_REGS
7518 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7519 return FP_REGS;
7520
7521 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7522 return GENERAL_REGS;
7523
7524 return NO_REGS;
7525 }
7526
7527 static bool
7528 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7529 {
7530 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7531
7532 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7533 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7534 if (frame_pointer_needed)
7535 return to == HARD_FRAME_POINTER_REGNUM;
7536 return true;
7537 }
7538
7539 poly_int64
7540 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7541 {
7542 aarch64_layout_frame ();
7543
7544 if (to == HARD_FRAME_POINTER_REGNUM)
7545 {
7546 if (from == ARG_POINTER_REGNUM)
7547 return cfun->machine->frame.hard_fp_offset;
7548
7549 if (from == FRAME_POINTER_REGNUM)
7550 return cfun->machine->frame.hard_fp_offset
7551 - cfun->machine->frame.locals_offset;
7552 }
7553
7554 if (to == STACK_POINTER_REGNUM)
7555 {
7556 if (from == FRAME_POINTER_REGNUM)
7557 return cfun->machine->frame.frame_size
7558 - cfun->machine->frame.locals_offset;
7559 }
7560
7561 return cfun->machine->frame.frame_size;
7562 }
7563
7564 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7565 previous frame. */
7566
7567 rtx
7568 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7569 {
7570 if (count != 0)
7571 return const0_rtx;
7572 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7573 }
7574
7575
7576 static void
7577 aarch64_asm_trampoline_template (FILE *f)
7578 {
7579 if (TARGET_ILP32)
7580 {
7581 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7582 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7583 }
7584 else
7585 {
7586 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7587 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7588 }
7589 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7590 assemble_aligned_integer (4, const0_rtx);
7591 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7592 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7593 }
7594
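/* Initialise a trampoline.  The template emitted by
   aarch64_asm_trampoline_template above is 16 bytes of code followed
   by two pointer-sized data slots; store the target function's
   address in the first slot and the static chain value in the
   second.  */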
7595 static void
7596 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7597 {
7598 rtx fnaddr, mem, a_tramp;
7599 const int tramp_code_sz = 16;
7600
7601 /* We don't need to copy the trailing D-words; we fill those in below. */
7602 emit_block_move (m_tramp, assemble_trampoline_template (),
7603 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7604 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7605 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7606 if (GET_MODE (fnaddr) != ptr_mode)
7607 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7608 emit_move_insn (mem, fnaddr);
7609
7610 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7611 emit_move_insn (mem, chain_value);
7612
7613 /* XXX We should really define a "clear_cache" pattern and use
7614 gen_clear_cache(). */
7615 a_tramp = XEXP (m_tramp, 0);
7616 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7617 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7618 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7619 ptr_mode);
7620 }
7621
7622 static unsigned char
7623 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7624 {
7625 /* ??? Logically we should only need to provide a value when
7626 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7627 can hold MODE, but at the moment we need to handle all modes.
7628 Just ignore any runtime parts for registers that can't store them. */
7629 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7630 unsigned int nregs;
7631 switch (regclass)
7632 {
7633 case TAILCALL_ADDR_REGS:
7634 case POINTER_REGS:
7635 case GENERAL_REGS:
7636 case ALL_REGS:
7637 case POINTER_AND_FP_REGS:
7638 case FP_REGS:
7639 case FP_LO_REGS:
7640 if (aarch64_sve_data_mode_p (mode)
7641 && constant_multiple_p (GET_MODE_SIZE (mode),
7642 BYTES_PER_SVE_VECTOR, &nregs))
7643 return nregs;
7644 return (aarch64_vector_data_mode_p (mode)
7645 ? CEIL (lowest_size, UNITS_PER_VREG)
7646 : CEIL (lowest_size, UNITS_PER_WORD));
7647 case STACK_REG:
7648 case PR_REGS:
7649 case PR_LO_REGS:
7650 case PR_HI_REGS:
7651 return 1;
7652
7653 case NO_REGS:
7654 return 0;
7655
7656 default:
7657 break;
7658 }
7659 gcc_unreachable ();
7660 }
7661
7662 static reg_class_t
7663 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7664 {
7665 if (regclass == POINTER_REGS)
7666 return GENERAL_REGS;
7667
7668 if (regclass == STACK_REG)
7669 {
7670 if (REG_P(x)
7671 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7672 return regclass;
7673
7674 return NO_REGS;
7675 }
7676
7677 /* Register elimination can result in a request for
7678 SP+constant->FP_REGS. We cannot support such operations, which
7679 use SP as the source and an FP_REG as the destination, so reject
7680 them right away. */
7681 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7682 {
7683 rtx lhs = XEXP (x, 0);
7684
7685 /* Look through a possible SUBREG introduced by ILP32. */
7686 if (GET_CODE (lhs) == SUBREG)
7687 lhs = SUBREG_REG (lhs);
7688
7689 gcc_assert (REG_P (lhs));
7690 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7691 POINTER_REGS));
7692 return NO_REGS;
7693 }
7694
7695 return regclass;
7696 }
7697
7698 void
7699 aarch64_asm_output_labelref (FILE* f, const char *name)
7700 {
7701 asm_fprintf (f, "%U%s", name);
7702 }
7703
7704 static void
7705 aarch64_elf_asm_constructor (rtx symbol, int priority)
7706 {
7707 if (priority == DEFAULT_INIT_PRIORITY)
7708 default_ctor_section_asm_out_constructor (symbol, priority);
7709 else
7710 {
7711 section *s;
7712 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7713 would be enough, the compiler might not know that. To avoid a
7714 -Wformat-truncation false positive, use a larger size. */
7715 char buf[23];
7716 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7717 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7718 switch_to_section (s);
7719 assemble_align (POINTER_SIZE);
7720 assemble_aligned_integer (POINTER_BYTES, symbol);
7721 }
7722 }
7723
7724 static void
7725 aarch64_elf_asm_destructor (rtx symbol, int priority)
7726 {
7727 if (priority == DEFAULT_INIT_PRIORITY)
7728 default_dtor_section_asm_out_destructor (symbol, priority);
7729 else
7730 {
7731 section *s;
7732 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7733 would be enough, the compiler might not know that. To avoid a
7734 -Wformat-truncation false positive, use a larger size. */
7735 char buf[23];
7736 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7737 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7738 switch_to_section (s);
7739 assemble_align (POINTER_SIZE);
7740 assemble_aligned_integer (POINTER_BYTES, symbol);
7741 }
7742 }
7743
7744 const char*
7745 aarch64_output_casesi (rtx *operands)
7746 {
7747 char buf[100];
7748 char label[100];
7749 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7750 int index;
7751 static const char *const patterns[4][2] =
7752 {
7753 {
7754 "ldrb\t%w3, [%0,%w1,uxtw]",
7755 "add\t%3, %4, %w3, sxtb #2"
7756 },
7757 {
7758 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7759 "add\t%3, %4, %w3, sxth #2"
7760 },
7761 {
7762 "ldr\t%w3, [%0,%w1,uxtw #2]",
7763 "add\t%3, %4, %w3, sxtw #2"
7764 },
7765 /* We assume that DImode is only generated when not optimizing and
7766 that we don't really need 64-bit address offsets. That would
7767 imply an object file with 8GB of code in a single function! */
7768 {
7769 "ldr\t%w3, [%0,%w1,uxtw #2]",
7770 "add\t%3, %4, %w3, sxtw #2"
7771 }
7772 };
7773
7774 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7775
7776 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7777 index = exact_log2 (GET_MODE_SIZE (mode));
7778
7779 gcc_assert (index >= 0 && index <= 3);
7780
7781 /* Need to implement table size reduction, by changing the code below. */
7782 output_asm_insn (patterns[index][0], operands);
7783 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7784 snprintf (buf, sizeof (buf),
7785 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7786 output_asm_insn (buf, operands);
7787 output_asm_insn (patterns[index][1], operands);
7788 output_asm_insn ("br\t%3", operands);
7789 assemble_label (asm_out_file, label);
7790 return "";
7791 }
7792
7793
7794 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7795 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7796 operator. */
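/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1, i.e. a byte mask shifted left by one, which is
   suitable for a UXTB-based extend.  */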
7797
7798 int
7799 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7800 {
7801 if (shift >= 0 && shift <= 3)
7802 {
7803 int size;
7804 for (size = 8; size <= 32; size *= 2)
7805 {
7806 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7807 if (mask == bits << shift)
7808 return size;
7809 }
7810 }
7811 return 0;
7812 }
7813
7814 /* Constant pools are per-function only when PC-relative
7815 literal loads are enabled or we are using the large memory
7816 model. */
7817
7818 static inline bool
7819 aarch64_can_use_per_function_literal_pools_p (void)
7820 {
7821 return (aarch64_pcrelative_literal_loads
7822 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7823 }
7824
7825 static bool
7826 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7827 {
7828 /* We can't use blocks for constants when we're using a per-function
7829 constant pool. */
7830 return !aarch64_can_use_per_function_literal_pools_p ();
7831 }
7832
7833 /* Select appropriate section for constants depending
7834 on where we place literal pools. */
7835
7836 static section *
7837 aarch64_select_rtx_section (machine_mode mode,
7838 rtx x,
7839 unsigned HOST_WIDE_INT align)
7840 {
7841 if (aarch64_can_use_per_function_literal_pools_p ())
7842 return function_section (current_function_decl);
7843
7844 return default_elf_select_rtx_section (mode, x, align);
7845 }
7846
7847 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7848 void
7849 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7850 HOST_WIDE_INT offset)
7851 {
7852 /* When using per-function literal pools, we must ensure that any code
7853 section is aligned to the minimal instruction length, lest we get
7854 errors from the assembler re "unaligned instructions". */
7855 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7856 ASM_OUTPUT_ALIGN (f, 2);
7857 }
7858
7859 /* Costs. */
7860
7861 /* Helper function for rtx cost calculation. Strip a shift expression
7862 from X. Returns the inner operand if successful, or the original
7863 expression on failure. */
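/* For example, both (ashift (reg X) (const_int 3)) and
   (mult (reg X) (const_int 8)) strip down to (reg X), since a
   multiplication by a power of two is just a left shift.  */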
7864 static rtx
7865 aarch64_strip_shift (rtx x)
7866 {
7867 rtx op = x;
7868
7869 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7870 we can convert both to ROR during final output. */
7871 if ((GET_CODE (op) == ASHIFT
7872 || GET_CODE (op) == ASHIFTRT
7873 || GET_CODE (op) == LSHIFTRT
7874 || GET_CODE (op) == ROTATERT
7875 || GET_CODE (op) == ROTATE)
7876 && CONST_INT_P (XEXP (op, 1)))
7877 return XEXP (op, 0);
7878
7879 if (GET_CODE (op) == MULT
7880 && CONST_INT_P (XEXP (op, 1))
7881 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7882 return XEXP (op, 0);
7883
7884 return x;
7885 }
7886
7887 /* Helper function for rtx cost calculation. Strip an extend
7888 expression from X. Returns the inner operand if successful, or the
7889 original expression on failure. We deal with a number of possible
7890 canonicalization variations here. If STRIP_SHIFT is true, then
7891 we can strip off a shift also. */
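/* For example, (zero_extend:DI (reg:SI X)) strips to (reg:SI X), and
   with STRIP_SHIFT true (ashift:DI (sign_extend:DI (reg:SI X))
   (const_int 2)) also strips down to (reg:SI X).  */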
7892 static rtx
7893 aarch64_strip_extend (rtx x, bool strip_shift)
7894 {
7895 scalar_int_mode mode;
7896 rtx op = x;
7897
7898 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7899 return op;
7900
7901 /* Zero and sign extraction of a widened value. */
7902 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7903 && XEXP (op, 2) == const0_rtx
7904 && GET_CODE (XEXP (op, 0)) == MULT
7905 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7906 XEXP (op, 1)))
7907 return XEXP (XEXP (op, 0), 0);
7908
7909 /* It can also be represented (for zero-extend) as an AND with an
7910 immediate. */
7911 if (GET_CODE (op) == AND
7912 && GET_CODE (XEXP (op, 0)) == MULT
7913 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7914 && CONST_INT_P (XEXP (op, 1))
7915 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7916 INTVAL (XEXP (op, 1))) != 0)
7917 return XEXP (XEXP (op, 0), 0);
7918
7919 /* Now handle extended register, as this may also have an optional
7920 left shift by 1..4. */
7921 if (strip_shift
7922 && GET_CODE (op) == ASHIFT
7923 && CONST_INT_P (XEXP (op, 1))
7924 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7925 op = XEXP (op, 0);
7926
7927 if (GET_CODE (op) == ZERO_EXTEND
7928 || GET_CODE (op) == SIGN_EXTEND)
7929 op = XEXP (op, 0);
7930
7931 if (op != x)
7932 return op;
7933
7934 return x;
7935 }
7936
7937 /* Return true iff CODE is a shift supported in combination
7938 with arithmetic instructions. */
7939
7940 static bool
7941 aarch64_shift_p (enum rtx_code code)
7942 {
7943 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7944 }
7945
7946
7947 /* Return true iff X is a cheap shift without a sign extend. */
7948
7949 static bool
7950 aarch64_cheap_mult_shift_p (rtx x)
7951 {
7952 rtx op0, op1;
7953
7954 op0 = XEXP (x, 0);
7955 op1 = XEXP (x, 1);
7956
7957 if (!(aarch64_tune_params.extra_tuning_flags
7958 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7959 return false;
7960
7961 if (GET_CODE (op0) == SIGN_EXTEND)
7962 return false;
7963
7964 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7965 && UINTVAL (op1) <= 4)
7966 return true;
7967
7968 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7969 return false;
7970
7971 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7972
7973 if (l2 > 0 && l2 <= 4)
7974 return true;
7975
7976 return false;
7977 }
7978
7979 /* Helper function for rtx cost calculation. Calculate the cost of
7980 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7981 Return the calculated cost of the expression, recursing manually in to
7982 operands where needed. */
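/* For example, when costing the inner multiply of
   (plus:DI (mult:DI (reg X) (reg Y)) (reg Z)) with OUTER == PLUS,
   COMPOUND_P is true and the multiply is costed as an MADD: the two
   operand costs plus extra_cost->mult[mode == DImode].add when
   costing for speed.  */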
7983
7984 static int
7985 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7986 {
7987 rtx op0, op1;
7988 const struct cpu_cost_table *extra_cost
7989 = aarch64_tune_params.insn_extra_cost;
7990 int cost = 0;
7991 bool compound_p = (outer == PLUS || outer == MINUS);
7992 machine_mode mode = GET_MODE (x);
7993
7994 gcc_checking_assert (code == MULT);
7995
7996 op0 = XEXP (x, 0);
7997 op1 = XEXP (x, 1);
7998
7999 if (VECTOR_MODE_P (mode))
8000 mode = GET_MODE_INNER (mode);
8001
8002 /* Integer multiply/fma. */
8003 if (GET_MODE_CLASS (mode) == MODE_INT)
8004 {
8005 /* The multiply will be canonicalized as a shift, cost it as such. */
8006 if (aarch64_shift_p (GET_CODE (x))
8007 || (CONST_INT_P (op1)
8008 && exact_log2 (INTVAL (op1)) > 0))
8009 {
8010 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8011 || GET_CODE (op0) == SIGN_EXTEND;
8012 if (speed)
8013 {
8014 if (compound_p)
8015 {
8016 /* If the shift is considered cheap,
8017 then don't add any cost. */
8018 if (aarch64_cheap_mult_shift_p (x))
8019 ;
8020 else if (REG_P (op1))
8021 /* ARITH + shift-by-register. */
8022 cost += extra_cost->alu.arith_shift_reg;
8023 else if (is_extend)
8024 /* ARITH + extended register. We don't have a cost field
8025 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8026 cost += extra_cost->alu.extend_arith;
8027 else
8028 /* ARITH + shift-by-immediate. */
8029 cost += extra_cost->alu.arith_shift;
8030 }
8031 else
8032 /* LSL (immediate). */
8033 cost += extra_cost->alu.shift;
8034
8035 }
8036 /* Strip extends as we will have costed them in the case above. */
8037 if (is_extend)
8038 op0 = aarch64_strip_extend (op0, true);
8039
8040 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8041
8042 return cost;
8043 }
8044
8045 /* MNEG or [US]MNEGL. Extract the NEG operand, mark the operation as a
8046 compound, and let the cases below handle it. After all, MNEG is a
8047 special-case alias of MSUB. */
8048 if (GET_CODE (op0) == NEG)
8049 {
8050 op0 = XEXP (op0, 0);
8051 compound_p = true;
8052 }
8053
8054 /* Integer multiplies or FMAs have zero/sign extending variants. */
8055 if ((GET_CODE (op0) == ZERO_EXTEND
8056 && GET_CODE (op1) == ZERO_EXTEND)
8057 || (GET_CODE (op0) == SIGN_EXTEND
8058 && GET_CODE (op1) == SIGN_EXTEND))
8059 {
8060 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8061 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8062
8063 if (speed)
8064 {
8065 if (compound_p)
8066 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8067 cost += extra_cost->mult[0].extend_add;
8068 else
8069 /* MUL/SMULL/UMULL. */
8070 cost += extra_cost->mult[0].extend;
8071 }
8072
8073 return cost;
8074 }
8075
8076 /* This is either an integer multiply or a MADD. In both cases
8077 we want to recurse and cost the operands. */
8078 cost += rtx_cost (op0, mode, MULT, 0, speed);
8079 cost += rtx_cost (op1, mode, MULT, 1, speed);
8080
8081 if (speed)
8082 {
8083 if (compound_p)
8084 /* MADD/MSUB. */
8085 cost += extra_cost->mult[mode == DImode].add;
8086 else
8087 /* MUL. */
8088 cost += extra_cost->mult[mode == DImode].simple;
8089 }
8090
8091 return cost;
8092 }
8093 else
8094 {
8095 if (speed)
8096 {
8097 /* Floating-point FMA/FMUL can also support negations of the
8098 operands, unless the rounding mode is upward or downward, in
8099 which case FNMUL is different from FMUL with operand negation. */
8100 bool neg0 = GET_CODE (op0) == NEG;
8101 bool neg1 = GET_CODE (op1) == NEG;
8102 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8103 {
8104 if (neg0)
8105 op0 = XEXP (op0, 0);
8106 if (neg1)
8107 op1 = XEXP (op1, 0);
8108 }
8109
8110 if (compound_p)
8111 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8112 cost += extra_cost->fp[mode == DFmode].fma;
8113 else
8114 /* FMUL/FNMUL. */
8115 cost += extra_cost->fp[mode == DFmode].mult;
8116 }
8117
8118 cost += rtx_cost (op0, mode, MULT, 0, speed);
8119 cost += rtx_cost (op1, mode, MULT, 1, speed);
8120 return cost;
8121 }
8122 }
8123
8124 static int
8125 aarch64_address_cost (rtx x,
8126 machine_mode mode,
8127 addr_space_t as ATTRIBUTE_UNUSED,
8128 bool speed)
8129 {
8130 enum rtx_code c = GET_CODE (x);
8131 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8132 struct aarch64_address_info info;
8133 int cost = 0;
8134 info.shift = 0;
8135
8136 if (!aarch64_classify_address (&info, x, mode, false))
8137 {
8138 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8139 {
8140 /* This is a CONST or SYMBOL ref which will be split
8141 in a different way depending on the code model in use.
8142 Cost it through the generic infrastructure. */
8143 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8144 /* Divide through by the cost of one instruction to
8145 bring it to the same units as the address costs. */
8146 cost_symbol_ref /= COSTS_N_INSNS (1);
8147 /* The cost is then the cost of preparing the address,
8148 followed by an immediate (possibly 0) offset. */
8149 return cost_symbol_ref + addr_cost->imm_offset;
8150 }
8151 else
8152 {
8153 /* This is most likely a jump table from a case
8154 statement. */
8155 return addr_cost->register_offset;
8156 }
8157 }
8158
8159 switch (info.type)
8160 {
8161 case ADDRESS_LO_SUM:
8162 case ADDRESS_SYMBOLIC:
8163 case ADDRESS_REG_IMM:
8164 cost += addr_cost->imm_offset;
8165 break;
8166
8167 case ADDRESS_REG_WB:
8168 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8169 cost += addr_cost->pre_modify;
8170 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8171 cost += addr_cost->post_modify;
8172 else
8173 gcc_unreachable ();
8174
8175 break;
8176
8177 case ADDRESS_REG_REG:
8178 cost += addr_cost->register_offset;
8179 break;
8180
8181 case ADDRESS_REG_SXTW:
8182 cost += addr_cost->register_sextend;
8183 break;
8184
8185 case ADDRESS_REG_UXTW:
8186 cost += addr_cost->register_zextend;
8187 break;
8188
8189 default:
8190 gcc_unreachable ();
8191 }
8192
8193
8194 if (info.shift > 0)
8195 {
8196 /* For the sake of calculating the cost of the shifted register
8197 component, we can treat same sized modes in the same way. */
8198 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8199 cost += addr_cost->addr_scale_costs.hi;
8200 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8201 cost += addr_cost->addr_scale_costs.si;
8202 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8203 cost += addr_cost->addr_scale_costs.di;
8204 else
8205 /* We can't tell, or this is a 128-bit vector. */
8206 cost += addr_cost->addr_scale_costs.ti;
8207 }
8208
8209 return cost;
8210 }
8211
8212 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8213 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8214 to be taken. */
8215
8216 int
8217 aarch64_branch_cost (bool speed_p, bool predictable_p)
8218 {
8219 /* When optimizing for speed, use the cost of unpredictable branches. */
8220 const struct cpu_branch_cost *branch_costs =
8221 aarch64_tune_params.branch_costs;
8222
8223 if (!speed_p || predictable_p)
8224 return branch_costs->predictable;
8225 else
8226 return branch_costs->unpredictable;
8227 }
8228
8229 /* Return true if the RTX X in mode MODE is a zero or sign extract
8230 usable in an ADD or SUB (extended register) instruction. */
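/* For example, (sign_extend:DI (reg:SI X)) matches the shift-free
   case below and so can be folded into an ADD/SUB (extended
   register) such as "add x0, x1, w2, sxtw".  */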
8231 static bool
8232 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8233 {
8234 /* Catch add with a sign extract.
8235 This is add_<optab><mode>_multp2. */
8236 if (GET_CODE (x) == SIGN_EXTRACT
8237 || GET_CODE (x) == ZERO_EXTRACT)
8238 {
8239 rtx op0 = XEXP (x, 0);
8240 rtx op1 = XEXP (x, 1);
8241 rtx op2 = XEXP (x, 2);
8242
8243 if (GET_CODE (op0) == MULT
8244 && CONST_INT_P (op1)
8245 && op2 == const0_rtx
8246 && CONST_INT_P (XEXP (op0, 1))
8247 && aarch64_is_extend_from_extract (mode,
8248 XEXP (op0, 1),
8249 op1))
8250 {
8251 return true;
8252 }
8253 }
8254 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8255 No shift. */
8256 else if (GET_CODE (x) == SIGN_EXTEND
8257 || GET_CODE (x) == ZERO_EXTEND)
8258 return REG_P (XEXP (x, 0));
8259
8260 return false;
8261 }
8262
8263 static bool
8264 aarch64_frint_unspec_p (unsigned int u)
8265 {
8266 switch (u)
8267 {
8268 case UNSPEC_FRINTZ:
8269 case UNSPEC_FRINTP:
8270 case UNSPEC_FRINTM:
8271 case UNSPEC_FRINTA:
8272 case UNSPEC_FRINTN:
8273 case UNSPEC_FRINTX:
8274 case UNSPEC_FRINTI:
8275 return true;
8276
8277 default:
8278 return false;
8279 }
8280 }
8281
8282 /* Return true iff X is an rtx that will match an extr instruction
8283 i.e. as described in the *extr<mode>5_insn family of patterns.
8284 OP0 and OP1 will be set to the operands of the shifts involved
8285 on success and will be NULL_RTX otherwise. */
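/* For example, (ior:DI (ashift:DI (reg X) (const_int 48))
   (lshiftrt:DI (reg Y) (const_int 16))) matches, since the two shift
   amounts sum to the 64-bit mode width, and corresponds to an EXTR
   with a shift amount of 16.  */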
8286
8287 static bool
8288 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8289 {
8290 rtx op0, op1;
8291 scalar_int_mode mode;
8292 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8293 return false;
8294
8295 *res_op0 = NULL_RTX;
8296 *res_op1 = NULL_RTX;
8297
8298 if (GET_CODE (x) != IOR)
8299 return false;
8300
8301 op0 = XEXP (x, 0);
8302 op1 = XEXP (x, 1);
8303
8304 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8305 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8306 {
8307 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8308 if (GET_CODE (op1) == ASHIFT)
8309 std::swap (op0, op1);
8310
8311 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8312 return false;
8313
8314 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8315 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8316
8317 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8318 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8319 {
8320 *res_op0 = XEXP (op0, 0);
8321 *res_op1 = XEXP (op1, 0);
8322 return true;
8323 }
8324 }
8325
8326 return false;
8327 }
8328
8329 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8330 storing it in *COST. Result is true if the total cost of the operation
8331 has now been calculated. */
8332 static bool
8333 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8334 {
8335 rtx inner;
8336 rtx comparator;
8337 enum rtx_code cmpcode;
8338
8339 if (COMPARISON_P (op0))
8340 {
8341 inner = XEXP (op0, 0);
8342 comparator = XEXP (op0, 1);
8343 cmpcode = GET_CODE (op0);
8344 }
8345 else
8346 {
8347 inner = op0;
8348 comparator = const0_rtx;
8349 cmpcode = NE;
8350 }
8351
8352 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8353 {
8354 /* Conditional branch. */
8355 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8356 return true;
8357 else
8358 {
8359 if (cmpcode == NE || cmpcode == EQ)
8360 {
8361 if (comparator == const0_rtx)
8362 {
8363 /* TBZ/TBNZ/CBZ/CBNZ. */
8364 if (GET_CODE (inner) == ZERO_EXTRACT)
8365 /* TBZ/TBNZ. */
8366 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8367 ZERO_EXTRACT, 0, speed);
8368 else
8369 /* CBZ/CBNZ. */
8370 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8371
8372 return true;
8373 }
8374 }
8375 else if (cmpcode == LT || cmpcode == GE)
8376 {
8377 /* TBZ/TBNZ. */
8378 if (comparator == const0_rtx)
8379 return true;
8380 }
8381 }
8382 }
8383 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8384 {
8385 /* CCMP. */
8386 if (GET_CODE (op1) == COMPARE)
8387 {
8388 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8389 if (XEXP (op1, 1) == const0_rtx)
8390 *cost += 1;
8391 if (speed)
8392 {
8393 machine_mode mode = GET_MODE (XEXP (op1, 0));
8394 const struct cpu_cost_table *extra_cost
8395 = aarch64_tune_params.insn_extra_cost;
8396
8397 if (GET_MODE_CLASS (mode) == MODE_INT)
8398 *cost += extra_cost->alu.arith;
8399 else
8400 *cost += extra_cost->fp[mode == DFmode].compare;
8401 }
8402 return true;
8403 }
8404
8405 /* It's a conditional operation based on the status flags,
8406 so it must be some flavor of CSEL. */
8407
8408 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8409 if (GET_CODE (op1) == NEG
8410 || GET_CODE (op1) == NOT
8411 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8412 op1 = XEXP (op1, 0);
8413 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8414 {
8415 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8416 op1 = XEXP (op1, 0);
8417 op2 = XEXP (op2, 0);
8418 }
8419
8420 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8421 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8422 return true;
8423 }
8424
8425 /* We don't know what this is, cost all operands. */
8426 return false;
8427 }
8428
8429 /* Check whether X is a bitfield operation of the form shift + extend that
8430 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8431 operand to which the bitfield operation is applied. Otherwise return
8432 NULL_RTX. */
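/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI X) (const_int 3)))
   returns (reg:HI X), since the shift + zero-extend combination maps
   to a single UBFX instruction.  */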
8433
8434 static rtx
8435 aarch64_extend_bitfield_pattern_p (rtx x)
8436 {
8437 rtx_code outer_code = GET_CODE (x);
8438 machine_mode outer_mode = GET_MODE (x);
8439
8440 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8441 && outer_mode != SImode && outer_mode != DImode)
8442 return NULL_RTX;
8443
8444 rtx inner = XEXP (x, 0);
8445 rtx_code inner_code = GET_CODE (inner);
8446 machine_mode inner_mode = GET_MODE (inner);
8447 rtx op = NULL_RTX;
8448
8449 switch (inner_code)
8450 {
8451 case ASHIFT:
8452 if (CONST_INT_P (XEXP (inner, 1))
8453 && (inner_mode == QImode || inner_mode == HImode))
8454 op = XEXP (inner, 0);
8455 break;
8456 case LSHIFTRT:
8457 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8458 && (inner_mode == QImode || inner_mode == HImode))
8459 op = XEXP (inner, 0);
8460 break;
8461 case ASHIFTRT:
8462 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8463 && (inner_mode == QImode || inner_mode == HImode))
8464 op = XEXP (inner, 0);
8465 break;
8466 default:
8467 break;
8468 }
8469
8470 return op;
8471 }
8472
8473 /* Return true if the mask and a shift amount from an RTX of the form
8474 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8475 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
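/* For example, in SImode a mask of 0x00ffff00 with a shift amount of 8
   is accepted: (0x00ffff00 >> 8) + 1 == 0x10000 is a power of two and
   the low 8 bits of the mask are clear, so the AND + shift pair maps
   to "ubfiz w0, w1, 8, 16".  */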
8476
8477 bool
8478 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8479 rtx shft_amnt)
8480 {
8481 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8482 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8483 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8484 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8485 }
8486
8487 /* Calculate the cost of calculating X, storing it in *COST. Result
8488 is true if the total cost of the operation has now been calculated. */
8489 static bool
8490 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8491 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8492 {
8493 rtx op0, op1, op2;
8494 const struct cpu_cost_table *extra_cost
8495 = aarch64_tune_params.insn_extra_cost;
8496 int code = GET_CODE (x);
8497 scalar_int_mode int_mode;
8498
8499 /* By default, assume that everything has equivalent cost to the
8500 cheapest instruction. Any additional costs are applied as a delta
8501 above this default. */
8502 *cost = COSTS_N_INSNS (1);
8503
8504 switch (code)
8505 {
8506 case SET:
8507 /* The cost depends entirely on the operands to SET. */
8508 *cost = 0;
8509 op0 = SET_DEST (x);
8510 op1 = SET_SRC (x);
8511
8512 switch (GET_CODE (op0))
8513 {
8514 case MEM:
8515 if (speed)
8516 {
8517 rtx address = XEXP (op0, 0);
8518 if (VECTOR_MODE_P (mode))
8519 *cost += extra_cost->ldst.storev;
8520 else if (GET_MODE_CLASS (mode) == MODE_INT)
8521 *cost += extra_cost->ldst.store;
8522 else if (mode == SFmode)
8523 *cost += extra_cost->ldst.storef;
8524 else if (mode == DFmode)
8525 *cost += extra_cost->ldst.stored;
8526
8527 *cost +=
8528 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8529 0, speed));
8530 }
8531
8532 *cost += rtx_cost (op1, mode, SET, 1, speed);
8533 return true;
8534
8535 case SUBREG:
8536 if (! REG_P (SUBREG_REG (op0)))
8537 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8538
8539 /* Fall through. */
8540 case REG:
8541 /* The cost is one per vector-register copied. */
8542 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8543 {
8544 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8545 *cost = COSTS_N_INSNS (nregs);
8546 }
8547 /* const0_rtx is in general free, but we will use an
8548 instruction to set a register to 0. */
8549 else if (REG_P (op1) || op1 == const0_rtx)
8550 {
8551 /* The cost is 1 per register copied. */
8552 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8553 *cost = COSTS_N_INSNS (nregs);
8554 }
8555 else
8556 /* Cost is just the cost of the RHS of the set. */
8557 *cost += rtx_cost (op1, mode, SET, 1, speed);
8558 return true;
8559
8560 case ZERO_EXTRACT:
8561 case SIGN_EXTRACT:
8562 /* Bit-field insertion. Strip any redundant widening of
8563 the RHS to meet the width of the target. */
8564 if (GET_CODE (op1) == SUBREG)
8565 op1 = SUBREG_REG (op1);
8566 if ((GET_CODE (op1) == ZERO_EXTEND
8567 || GET_CODE (op1) == SIGN_EXTEND)
8568 && CONST_INT_P (XEXP (op0, 1))
8569 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8570 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8571 op1 = XEXP (op1, 0);
8572
8573 if (CONST_INT_P (op1))
8574 {
8575 /* MOV immediate is assumed to always be cheap. */
8576 *cost = COSTS_N_INSNS (1);
8577 }
8578 else
8579 {
8580 /* BFM. */
8581 if (speed)
8582 *cost += extra_cost->alu.bfi;
8583 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8584 }
8585
8586 return true;
8587
8588 default:
8589 /* We can't make sense of this, assume default cost. */
8590 *cost = COSTS_N_INSNS (1);
8591 return false;
8592 }
8593 return false;
8594
8595 case CONST_INT:
8596 /* If an instruction can incorporate a constant within the
8597 instruction, the instruction's expression avoids calling
8598 rtx_cost() on the constant. If rtx_cost() is called on a
8599 constant, then it is usually because the constant must be
8600 moved into a register by one or more instructions.
8601
8602 The exception is constant 0, which can be expressed
8603 as XZR/WZR and is therefore free. The exception to this is
8604 if we have (set (reg) (const0_rtx)) in which case we must cost
8605 the move. However, we can catch that when we cost the SET, so
8606 we don't need to consider that here. */
8607 if (x == const0_rtx)
8608 *cost = 0;
8609 else
8610 {
8611 /* To an approximation, building any other constant is
8612 proportionally expensive to the number of instructions
8613 required to build that constant. This is true whether we
8614 are compiling for SPEED or otherwise. */
8615 if (!is_a <scalar_int_mode> (mode, &int_mode))
8616 int_mode = word_mode;
8617 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8618 (NULL_RTX, x, false, int_mode));
8619 }
8620 return true;
8621
8622 case CONST_DOUBLE:
8623
8624 /* First determine number of instructions to do the move
8625 as an integer constant. */
8626 if (!aarch64_float_const_representable_p (x)
8627 && !aarch64_can_const_movi_rtx_p (x, mode)
8628 && aarch64_float_const_rtx_p (x))
8629 {
8630 unsigned HOST_WIDE_INT ival;
8631 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8632 gcc_assert (succeed);
8633
8634 scalar_int_mode imode = (mode == HFmode
8635 ? SImode
8636 : int_mode_for_mode (mode).require ());
8637 int ncost = aarch64_internal_mov_immediate
8638 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8639 *cost += COSTS_N_INSNS (ncost);
8640 return true;
8641 }
8642
8643 if (speed)
8644 {
8645 /* mov[df,sf]_aarch64. */
8646 if (aarch64_float_const_representable_p (x))
8647 /* FMOV (scalar immediate). */
8648 *cost += extra_cost->fp[mode == DFmode].fpconst;
8649 else if (!aarch64_float_const_zero_rtx_p (x))
8650 {
8651 /* This will be a load from memory. */
8652 if (mode == DFmode)
8653 *cost += extra_cost->ldst.loadd;
8654 else
8655 *cost += extra_cost->ldst.loadf;
8656 }
8657 else
8658 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8659 or MOV v0.s[0], wzr - neither of which is modeled by the
8660 cost tables. Just use the default cost. */
8661 {
8662 }
8663 }
8664
8665 return true;
8666
8667 case MEM:
8668 if (speed)
8669 {
8670 /* For loads we want the base cost of a load, plus an
8671 approximation for the additional cost of the addressing
8672 mode. */
8673 rtx address = XEXP (x, 0);
8674 if (VECTOR_MODE_P (mode))
8675 *cost += extra_cost->ldst.loadv;
8676 else if (GET_MODE_CLASS (mode) == MODE_INT)
8677 *cost += extra_cost->ldst.load;
8678 else if (mode == SFmode)
8679 *cost += extra_cost->ldst.loadf;
8680 else if (mode == DFmode)
8681 *cost += extra_cost->ldst.loadd;
8682
8683 *cost +=
8684 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8685 0, speed));
8686 }
8687
8688 return true;
8689
8690 case NEG:
8691 op0 = XEXP (x, 0);
8692
8693 if (VECTOR_MODE_P (mode))
8694 {
8695 if (speed)
8696 {
8697 /* FNEG. */
8698 *cost += extra_cost->vect.alu;
8699 }
8700 return false;
8701 }
8702
8703 if (GET_MODE_CLASS (mode) == MODE_INT)
8704 {
8705 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8706 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8707 {
8708 /* CSETM. */
8709 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8710 return true;
8711 }
8712
8713 /* Cost this as SUB wzr, X. */
8714 op0 = CONST0_RTX (mode);
8715 op1 = XEXP (x, 0);
8716 goto cost_minus;
8717 }
8718
8719 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8720 {
8721 /* Support (neg(fma...)) as a single instruction only if
8722 sign of zeros is unimportant. This matches the decision
8723 making in aarch64.md. */
8724 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8725 {
8726 /* FNMADD. */
8727 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8728 return true;
8729 }
8730 if (GET_CODE (op0) == MULT)
8731 {
8732 /* FNMUL. */
8733 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8734 return true;
8735 }
8736 if (speed)
8737 /* FNEG. */
8738 *cost += extra_cost->fp[mode == DFmode].neg;
8739 return false;
8740 }
8741
8742 return false;
8743
8744 case CLRSB:
8745 case CLZ:
8746 if (speed)
8747 {
8748 if (VECTOR_MODE_P (mode))
8749 *cost += extra_cost->vect.alu;
8750 else
8751 *cost += extra_cost->alu.clz;
8752 }
8753
8754 return false;
8755
8756 case COMPARE:
8757 op0 = XEXP (x, 0);
8758 op1 = XEXP (x, 1);
8759
8760 if (op1 == const0_rtx
8761 && GET_CODE (op0) == AND)
8762 {
8763 x = op0;
8764 mode = GET_MODE (op0);
8765 goto cost_logic;
8766 }
8767
8768 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8769 {
8770 /* TODO: A write to the CC flags possibly costs extra, this
8771 needs encoding in the cost tables. */
8772
8773 mode = GET_MODE (op0);
8774 /* ANDS. */
8775 if (GET_CODE (op0) == AND)
8776 {
8777 x = op0;
8778 goto cost_logic;
8779 }
8780
8781 if (GET_CODE (op0) == PLUS)
8782 {
8783 /* ADDS (and CMN alias). */
8784 x = op0;
8785 goto cost_plus;
8786 }
8787
8788 if (GET_CODE (op0) == MINUS)
8789 {
8790 /* SUBS. */
8791 x = op0;
8792 goto cost_minus;
8793 }
8794
8795 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8796 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8797 && CONST_INT_P (XEXP (op0, 2)))
8798 {
8799 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8800 Handle it here directly rather than going to cost_logic
8801 since we know the immediate generated for the TST is valid
8802 so we can avoid creating an intermediate rtx for it only
8803 for costing purposes. */
8804 if (speed)
8805 *cost += extra_cost->alu.logical;
8806
8807 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8808 ZERO_EXTRACT, 0, speed);
8809 return true;
8810 }
8811
8812 if (GET_CODE (op1) == NEG)
8813 {
8814 /* CMN. */
8815 if (speed)
8816 *cost += extra_cost->alu.arith;
8817
8818 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8819 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8820 return true;
8821 }
8822
8823 /* CMP.
8824
8825 Compare can freely swap the order of operands, and
8826 canonicalization puts the more complex operation first.
8827 But the integer MINUS logic expects the shift/extend
8828 operation in op1. */
8829 if (! (REG_P (op0)
8830 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8831 {
8832 op0 = XEXP (x, 1);
8833 op1 = XEXP (x, 0);
8834 }
8835 goto cost_minus;
8836 }
8837
8838 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8839 {
8840 /* FCMP. */
8841 if (speed)
8842 *cost += extra_cost->fp[mode == DFmode].compare;
8843
8844 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8845 {
8846 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8847 /* FCMP supports constant 0.0 for no extra cost. */
8848 return true;
8849 }
8850 return false;
8851 }
8852
8853 if (VECTOR_MODE_P (mode))
8854 {
8855 /* Vector compare. */
8856 if (speed)
8857 *cost += extra_cost->vect.alu;
8858
8859 if (aarch64_float_const_zero_rtx_p (op1))
8860 {
8861 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8862 cost. */
8863 return true;
8864 }
8865 return false;
8866 }
8867 return false;
8868
8869 case MINUS:
8870 {
8871 op0 = XEXP (x, 0);
8872 op1 = XEXP (x, 1);
8873
8874 cost_minus:
8875 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8876
8877 /* Detect valid immediates. */
8878 if ((GET_MODE_CLASS (mode) == MODE_INT
8879 || (GET_MODE_CLASS (mode) == MODE_CC
8880 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8881 && CONST_INT_P (op1)
8882 && aarch64_uimm12_shift (INTVAL (op1)))
8883 {
8884 if (speed)
8885 /* SUB(S) (immediate). */
8886 *cost += extra_cost->alu.arith;
8887 return true;
8888 }
8889
8890 /* Look for SUB (extended register). */
8891 if (is_a <scalar_int_mode> (mode, &int_mode)
8892 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8893 {
8894 if (speed)
8895 *cost += extra_cost->alu.extend_arith;
8896
8897 op1 = aarch64_strip_extend (op1, true);
8898 *cost += rtx_cost (op1, VOIDmode,
8899 (enum rtx_code) GET_CODE (op1), 0, speed);
8900 return true;
8901 }
8902
8903 rtx new_op1 = aarch64_strip_extend (op1, false);
8904
8905 /* Cost this as an FMA-alike operation. */
8906 if ((GET_CODE (new_op1) == MULT
8907 || aarch64_shift_p (GET_CODE (new_op1)))
8908 && code != COMPARE)
8909 {
8910 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8911 (enum rtx_code) code,
8912 speed);
8913 return true;
8914 }
8915
8916 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8917
8918 if (speed)
8919 {
8920 if (VECTOR_MODE_P (mode))
8921 {
8922 /* Vector SUB. */
8923 *cost += extra_cost->vect.alu;
8924 }
8925 else if (GET_MODE_CLASS (mode) == MODE_INT)
8926 {
8927 /* SUB(S). */
8928 *cost += extra_cost->alu.arith;
8929 }
8930 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8931 {
8932 /* FSUB. */
8933 *cost += extra_cost->fp[mode == DFmode].addsub;
8934 }
8935 }
8936 return true;
8937 }
8938
8939 case PLUS:
8940 {
8941 rtx new_op0;
8942
8943 op0 = XEXP (x, 0);
8944 op1 = XEXP (x, 1);
8945
8946 cost_plus:
8947 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8948 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8949 {
8950 /* CSINC. */
8951 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8952 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8953 return true;
8954 }
8955
8956 if (GET_MODE_CLASS (mode) == MODE_INT
8957 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8958 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8959 {
8960 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8961
8962 if (speed)
8963 /* ADD (immediate). */
8964 *cost += extra_cost->alu.arith;
8965 return true;
8966 }
8967
8968 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8969
8970 /* Look for ADD (extended register). */
8971 if (is_a <scalar_int_mode> (mode, &int_mode)
8972 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8973 {
8974 if (speed)
8975 *cost += extra_cost->alu.extend_arith;
8976
8977 op0 = aarch64_strip_extend (op0, true);
8978 *cost += rtx_cost (op0, VOIDmode,
8979 (enum rtx_code) GET_CODE (op0), 0, speed);
8980 return true;
8981 }
8982
8983 /* Strip any extend, leave shifts behind as we will
8984 cost them through mult_cost. */
8985 new_op0 = aarch64_strip_extend (op0, false);
8986
8987 if (GET_CODE (new_op0) == MULT
8988 || aarch64_shift_p (GET_CODE (new_op0)))
8989 {
8990 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8991 speed);
8992 return true;
8993 }
8994
8995 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8996
8997 if (speed)
8998 {
8999 if (VECTOR_MODE_P (mode))
9000 {
9001 /* Vector ADD. */
9002 *cost += extra_cost->vect.alu;
9003 }
9004 else if (GET_MODE_CLASS (mode) == MODE_INT)
9005 {
9006 /* ADD. */
9007 *cost += extra_cost->alu.arith;
9008 }
9009 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9010 {
9011 /* FADD. */
9012 *cost += extra_cost->fp[mode == DFmode].addsub;
9013 }
9014 }
9015 return true;
9016 }
9017
9018 case BSWAP:
9019 *cost = COSTS_N_INSNS (1);
9020
9021 if (speed)
9022 {
9023 if (VECTOR_MODE_P (mode))
9024 *cost += extra_cost->vect.alu;
9025 else
9026 *cost += extra_cost->alu.rev;
9027 }
9028 return false;
9029
9030 case IOR:
9031 if (aarch_rev16_p (x))
9032 {
9033 *cost = COSTS_N_INSNS (1);
9034
9035 if (speed)
9036 {
9037 if (VECTOR_MODE_P (mode))
9038 *cost += extra_cost->vect.alu;
9039 else
9040 *cost += extra_cost->alu.rev;
9041 }
9042 return true;
9043 }
9044
9045 if (aarch64_extr_rtx_p (x, &op0, &op1))
9046 {
9047 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9048 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9049 if (speed)
9050 *cost += extra_cost->alu.shift;
9051
9052 return true;
9053 }
9054 /* Fall through. */
9055 case XOR:
9056 case AND:
9057 cost_logic:
9058 op0 = XEXP (x, 0);
9059 op1 = XEXP (x, 1);
9060
9061 if (VECTOR_MODE_P (mode))
9062 {
9063 if (speed)
9064 *cost += extra_cost->vect.alu;
9065 return true;
9066 }
9067
9068 if (code == AND
9069 && GET_CODE (op0) == MULT
9070 && CONST_INT_P (XEXP (op0, 1))
9071 && CONST_INT_P (op1)
9072 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9073 INTVAL (op1)) != 0)
9074 {
9075 /* This is a UBFM/SBFM. */
9076 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9077 if (speed)
9078 *cost += extra_cost->alu.bfx;
9079 return true;
9080 }
9081
9082 if (is_int_mode (mode, &int_mode))
9083 {
9084 if (CONST_INT_P (op1))
9085 {
9086 /* We have a mask + shift version of a UBFIZ
9087 i.e. the *andim_ashift<mode>_bfiz pattern. */
9088 if (GET_CODE (op0) == ASHIFT
9089 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9090 XEXP (op0, 1)))
9091 {
9092 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9093 (enum rtx_code) code, 0, speed);
9094 if (speed)
9095 *cost += extra_cost->alu.bfx;
9096
9097 return true;
9098 }
9099 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9100 {
9101 /* We possibly get the immediate for free; this is not
9102 modelled. */
9103 *cost += rtx_cost (op0, int_mode,
9104 (enum rtx_code) code, 0, speed);
9105 if (speed)
9106 *cost += extra_cost->alu.logical;
9107
9108 return true;
9109 }
9110 }
9111 else
9112 {
9113 rtx new_op0 = op0;
9114
9115 /* Handle ORN, EON, or BIC. */
9116 if (GET_CODE (op0) == NOT)
9117 op0 = XEXP (op0, 0);
9118
9119 new_op0 = aarch64_strip_shift (op0);
9120
9121 /* If we had a shift on op0 then this is a logical-shift-
9122 by-register/immediate operation. Otherwise, this is just
9123 a logical operation. */
9124 if (speed)
9125 {
9126 if (new_op0 != op0)
9127 {
9128 /* Shift by immediate. */
9129 if (CONST_INT_P (XEXP (op0, 1)))
9130 *cost += extra_cost->alu.log_shift;
9131 else
9132 *cost += extra_cost->alu.log_shift_reg;
9133 }
9134 else
9135 *cost += extra_cost->alu.logical;
9136 }
9137
9138 /* In both cases we want to cost both operands. */
9139 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9140 0, speed);
9141 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9142 1, speed);
9143
9144 return true;
9145 }
9146 }
9147 return false;
9148
9149 case NOT:
9150 x = XEXP (x, 0);
9151 op0 = aarch64_strip_shift (x);
9152
9153 if (VECTOR_MODE_P (mode))
9154 {
9155 /* Vector NOT. */
9156 *cost += extra_cost->vect.alu;
9157 return false;
9158 }
9159
9160 /* MVN-shifted-reg. */
9161 if (op0 != x)
9162 {
9163 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9164
9165 if (speed)
9166 *cost += extra_cost->alu.log_shift;
9167
9168 return true;
9169 }
9170 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9171 Handle the second form here taking care that 'a' in the above can
9172 be a shift. */
9173 else if (GET_CODE (op0) == XOR)
9174 {
9175 rtx newop0 = XEXP (op0, 0);
9176 rtx newop1 = XEXP (op0, 1);
9177 rtx op0_stripped = aarch64_strip_shift (newop0);
9178
9179 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9180 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9181
9182 if (speed)
9183 {
9184 if (op0_stripped != newop0)
9185 *cost += extra_cost->alu.log_shift;
9186 else
9187 *cost += extra_cost->alu.logical;
9188 }
9189
9190 return true;
9191 }
9192 /* MVN. */
9193 if (speed)
9194 *cost += extra_cost->alu.logical;
9195
9196 return false;
9197
9198 case ZERO_EXTEND:
9199
9200 op0 = XEXP (x, 0);
9201 /* If a value is written in SI mode, then zero extended to DI
9202 mode, the operation will in general be free as a write to
9203 a 'w' register implicitly zeroes the upper bits of an 'x'
9204 register. However, if this is
9205
9206 (set (reg) (zero_extend (reg)))
9207
9208 we must cost the explicit register move. */
9209 if (mode == DImode
9210 && GET_MODE (op0) == SImode
9211 && outer == SET)
9212 {
9213 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9214
9215 /* If OP_COST is non-zero, then the cost of the zero extend
9216 is effectively the cost of the inner operation. Otherwise
9217 we have a MOV instruction and we take the cost from the MOV
9218 itself. This is true independently of whether we are
9219 optimizing for space or time. */
9220 if (op_cost)
9221 *cost = op_cost;
9222
9223 return true;
9224 }
9225 else if (MEM_P (op0))
9226 {
9227 /* All loads can zero extend to any size for free. */
9228 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9229 return true;
9230 }
9231
9232 op0 = aarch64_extend_bitfield_pattern_p (x);
9233 if (op0)
9234 {
9235 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9236 if (speed)
9237 *cost += extra_cost->alu.bfx;
9238 return true;
9239 }
9240
9241 if (speed)
9242 {
9243 if (VECTOR_MODE_P (mode))
9244 {
9245 /* UMOV. */
9246 *cost += extra_cost->vect.alu;
9247 }
9248 else
9249 {
9250 /* We generate an AND instead of UXTB/UXTH. */
9251 *cost += extra_cost->alu.logical;
9252 }
9253 }
9254 return false;
9255
9256 case SIGN_EXTEND:
9257 if (MEM_P (XEXP (x, 0)))
9258 {
9259 /* LDRSH. */
9260 if (speed)
9261 {
9262 rtx address = XEXP (XEXP (x, 0), 0);
9263 *cost += extra_cost->ldst.load_sign_extend;
9264
9265 *cost +=
9266 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9267 0, speed));
9268 }
9269 return true;
9270 }
9271
9272 op0 = aarch64_extend_bitfield_pattern_p (x);
9273 if (op0)
9274 {
9275 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9276 if (speed)
9277 *cost += extra_cost->alu.bfx;
9278 return true;
9279 }
9280
9281 if (speed)
9282 {
9283 if (VECTOR_MODE_P (mode))
9284 *cost += extra_cost->vect.alu;
9285 else
9286 *cost += extra_cost->alu.extend;
9287 }
9288 return false;
9289
9290 case ASHIFT:
9291 op0 = XEXP (x, 0);
9292 op1 = XEXP (x, 1);
9293
9294 if (CONST_INT_P (op1))
9295 {
9296 if (speed)
9297 {
9298 if (VECTOR_MODE_P (mode))
9299 {
9300 /* Vector shift (immediate). */
9301 *cost += extra_cost->vect.alu;
9302 }
9303 else
9304 {
9305 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9306 aliases. */
9307 *cost += extra_cost->alu.shift;
9308 }
9309 }
9310
9311 /* We can incorporate zero/sign extend for free. */
9312 if (GET_CODE (op0) == ZERO_EXTEND
9313 || GET_CODE (op0) == SIGN_EXTEND)
9314 op0 = XEXP (op0, 0);
9315
9316 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9317 return true;
9318 }
9319 else
9320 {
9321 if (VECTOR_MODE_P (mode))
9322 {
9323 if (speed)
9324 /* Vector shift (register). */
9325 *cost += extra_cost->vect.alu;
9326 }
9327 else
9328 {
9329 if (speed)
9330 /* LSLV. */
9331 *cost += extra_cost->alu.shift_reg;
9332
9333 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9334 && CONST_INT_P (XEXP (op1, 1))
9335 && known_eq (INTVAL (XEXP (op1, 1)),
9336 GET_MODE_BITSIZE (mode) - 1))
9337 {
9338 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9339 /* We already demanded XEXP (op1, 0) to be REG_P, so
9340 don't recurse into it. */
9341 return true;
9342 }
9343 }
9344 return false; /* All arguments need to be in registers. */
9345 }
9346
9347 case ROTATE:
9348 case ROTATERT:
9349 case LSHIFTRT:
9350 case ASHIFTRT:
9351 op0 = XEXP (x, 0);
9352 op1 = XEXP (x, 1);
9353
9354 if (CONST_INT_P (op1))
9355 {
9356 /* ASR (immediate) and friends. */
9357 if (speed)
9358 {
9359 if (VECTOR_MODE_P (mode))
9360 *cost += extra_cost->vect.alu;
9361 else
9362 *cost += extra_cost->alu.shift;
9363 }
9364
9365 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9366 return true;
9367 }
9368 else
9369 {
9370 if (VECTOR_MODE_P (mode))
9371 {
9372 if (speed)
9373 /* Vector shift (register). */
9374 *cost += extra_cost->vect.alu;
9375 }
9376 else
9377 {
9378 if (speed)
9379 /* ASR (register) and friends. */
9380 *cost += extra_cost->alu.shift_reg;
9381
9382 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9383 && CONST_INT_P (XEXP (op1, 1))
9384 && known_eq (INTVAL (XEXP (op1, 1)),
9385 GET_MODE_BITSIZE (mode) - 1))
9386 {
9387 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9388 /* We already demanded XEXP (op1, 0) to be REG_P, so
9389 don't recurse into it. */
9390 return true;
9391 }
9392 }
9393 return false; /* All arguments need to be in registers. */
9394 }
9395
9396 case SYMBOL_REF:
9397
9398 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9399 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9400 {
9401 /* LDR. */
9402 if (speed)
9403 *cost += extra_cost->ldst.load;
9404 }
9405 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9406 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9407 {
9408 /* ADRP, followed by ADD. */
9409 *cost += COSTS_N_INSNS (1);
9410 if (speed)
9411 *cost += 2 * extra_cost->alu.arith;
9412 }
9413 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9414 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9415 {
9416 /* ADR. */
9417 if (speed)
9418 *cost += extra_cost->alu.arith;
9419 }
9420
9421 if (flag_pic)
9422 {
9423 /* One extra load instruction, after accessing the GOT. */
9424 *cost += COSTS_N_INSNS (1);
9425 if (speed)
9426 *cost += extra_cost->ldst.load;
9427 }
9428 return true;
9429
9430 case HIGH:
9431 case LO_SUM:
9432 /* ADRP/ADD (immediate). */
9433 if (speed)
9434 *cost += extra_cost->alu.arith;
9435 return true;
9436
9437 case ZERO_EXTRACT:
9438 case SIGN_EXTRACT:
9439 /* UBFX/SBFX. */
9440 if (speed)
9441 {
9442 if (VECTOR_MODE_P (mode))
9443 *cost += extra_cost->vect.alu;
9444 else
9445 *cost += extra_cost->alu.bfx;
9446 }
9447
9448 /* We can trust that the immediates used will be correct (there
9449 are no by-register forms), so we need only cost op0. */
9450 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9451 return true;
9452
9453 case MULT:
9454 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9455 /* aarch64_rtx_mult_cost always handles recursion to its
9456 operands. */
9457 return true;
9458
9459 case MOD:
9460 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9461 ANDs and a CSNEG. Assume here that CSNEG costs the same as
9462 an unconditional negate. This case should only ever be reached through
9463 the set_smod_pow2_cheap check in expmed.c. */
9464 if (CONST_INT_P (XEXP (x, 1))
9465 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9466 && (mode == SImode || mode == DImode))
9467 {
9468 /* We expand to 4 instructions. Reset the baseline. */
9469 *cost = COSTS_N_INSNS (4);
9470
9471 if (speed)
9472 *cost += 2 * extra_cost->alu.logical
9473 + 2 * extra_cost->alu.arith;
9474
9475 return true;
9476 }
9477
9478 /* Fall-through. */
9479 case UMOD:
9480 if (speed)
9481 {
9482 /* Slightly prefer UMOD over SMOD. */
9483 if (VECTOR_MODE_P (mode))
9484 *cost += extra_cost->vect.alu;
9485 else if (GET_MODE_CLASS (mode) == MODE_INT)
9486 *cost += (extra_cost->mult[mode == DImode].add
9487 + extra_cost->mult[mode == DImode].idiv
9488 + (code == MOD ? 1 : 0));
9489 }
9490 return false; /* All arguments need to be in registers. */
9491
9492 case DIV:
9493 case UDIV:
9494 case SQRT:
9495 if (speed)
9496 {
9497 if (VECTOR_MODE_P (mode))
9498 *cost += extra_cost->vect.alu;
9499 else if (GET_MODE_CLASS (mode) == MODE_INT)
9500 /* There is no integer SQRT, so only DIV and UDIV can get
9501 here. */
9502 *cost += (extra_cost->mult[mode == DImode].idiv
9503 /* Slightly prefer UDIV over SDIV. */
9504 + (code == DIV ? 1 : 0));
9505 else
9506 *cost += extra_cost->fp[mode == DFmode].div;
9507 }
9508 return false; /* All arguments need to be in registers. */
9509
9510 case IF_THEN_ELSE:
9511 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9512 XEXP (x, 2), cost, speed);
9513
9514 case EQ:
9515 case NE:
9516 case GT:
9517 case GTU:
9518 case LT:
9519 case LTU:
9520 case GE:
9521 case GEU:
9522 case LE:
9523 case LEU:
9524
9525 return false; /* All arguments must be in registers. */
9526
9527 case FMA:
9528 op0 = XEXP (x, 0);
9529 op1 = XEXP (x, 1);
9530 op2 = XEXP (x, 2);
9531
9532 if (speed)
9533 {
9534 if (VECTOR_MODE_P (mode))
9535 *cost += extra_cost->vect.alu;
9536 else
9537 *cost += extra_cost->fp[mode == DFmode].fma;
9538 }
9539
9540 /* FMSUB, FNMADD, and FNMSUB are free. */
9541 if (GET_CODE (op0) == NEG)
9542 op0 = XEXP (op0, 0);
9543
9544 if (GET_CODE (op2) == NEG)
9545 op2 = XEXP (op2, 0);
9546
9547 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9548 and the by-element operand as operand 0. */
9549 if (GET_CODE (op1) == NEG)
9550 op1 = XEXP (op1, 0);
9551
9552 /* Catch vector-by-element operations. The by-element operand can
9553 either be (vec_duplicate (vec_select (x))) or just
9554 (vec_select (x)), depending on whether we are multiplying by
9555 a vector or a scalar.
9556
9557 Canonicalization is not very good in these cases: FMA4 will put the
9558 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9559 if (GET_CODE (op0) == VEC_DUPLICATE)
9560 op0 = XEXP (op0, 0);
9561 else if (GET_CODE (op1) == VEC_DUPLICATE)
9562 op1 = XEXP (op1, 0);
9563
9564 if (GET_CODE (op0) == VEC_SELECT)
9565 op0 = XEXP (op0, 0);
9566 else if (GET_CODE (op1) == VEC_SELECT)
9567 op1 = XEXP (op1, 0);
9568
9569 /* If the remaining parameters are not registers,
9570 get the cost to put them into registers. */
9571 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9572 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9573 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9574 return true;
9575
9576 case FLOAT:
9577 case UNSIGNED_FLOAT:
9578 if (speed)
9579 *cost += extra_cost->fp[mode == DFmode].fromint;
9580 return false;
9581
9582 case FLOAT_EXTEND:
9583 if (speed)
9584 {
9585 if (VECTOR_MODE_P (mode))
9586 {
9587 /* Vector widen. */
9588 *cost += extra_cost->vect.alu;
9589 }
9590 else
9591 *cost += extra_cost->fp[mode == DFmode].widen;
9592 }
9593 return false;
9594
9595 case FLOAT_TRUNCATE:
9596 if (speed)
9597 {
9598 if (VECTOR_MODE_P (mode))
9599 {
9600 /* Vector conversion. */
9601 *cost += extra_cost->vect.alu;
9602 }
9603 else
9604 *cost += extra_cost->fp[mode == DFmode].narrow;
9605 }
9606 return false;
9607
9608 case FIX:
9609 case UNSIGNED_FIX:
9610 x = XEXP (x, 0);
9611 /* Strip the rounding part. They will all be implemented
9612 by the fcvt* family of instructions anyway. */
9613 if (GET_CODE (x) == UNSPEC)
9614 {
9615 unsigned int uns_code = XINT (x, 1);
9616
9617 if (uns_code == UNSPEC_FRINTA
9618 || uns_code == UNSPEC_FRINTM
9619 || uns_code == UNSPEC_FRINTN
9620 || uns_code == UNSPEC_FRINTP
9621 || uns_code == UNSPEC_FRINTZ)
9622 x = XVECEXP (x, 0, 0);
9623 }
9624
9625 if (speed)
9626 {
9627 if (VECTOR_MODE_P (mode))
9628 *cost += extra_cost->vect.alu;
9629 else
9630 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9631 }
9632
9633 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9634 fixed-point fcvt. */
9635 if (GET_CODE (x) == MULT
9636 && ((VECTOR_MODE_P (mode)
9637 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9638 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9639 {
9640 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9641 0, speed);
9642 return true;
9643 }
9644
9645 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9646 return true;
9647
9648 case ABS:
9649 if (VECTOR_MODE_P (mode))
9650 {
9651 /* ABS (vector). */
9652 if (speed)
9653 *cost += extra_cost->vect.alu;
9654 }
9655 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9656 {
9657 op0 = XEXP (x, 0);
9658
9659 /* FABD, which is analogous to FADD. */
9660 if (GET_CODE (op0) == MINUS)
9661 {
9662 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9663 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9664 if (speed)
9665 *cost += extra_cost->fp[mode == DFmode].addsub;
9666
9667 return true;
9668 }
9669 /* Simple FABS is analogous to FNEG. */
9670 if (speed)
9671 *cost += extra_cost->fp[mode == DFmode].neg;
9672 }
9673 else
9674 {
9675 /* Integer ABS will either be split into
9676 two arithmetic instructions, or will be an ABS
9677 (scalar), which we don't model. */
9678 *cost = COSTS_N_INSNS (2);
9679 if (speed)
9680 *cost += 2 * extra_cost->alu.arith;
9681 }
9682 return false;
9683
9684 case SMAX:
9685 case SMIN:
9686 if (speed)
9687 {
9688 if (VECTOR_MODE_P (mode))
9689 *cost += extra_cost->vect.alu;
9690 else
9691 {
9692 /* FMAXNM/FMINNM/FMAX/FMIN.
9693 TODO: This may not be accurate for all implementations, but
9694 we do not model this in the cost tables. */
9695 *cost += extra_cost->fp[mode == DFmode].addsub;
9696 }
9697 }
9698 return false;
9699
9700 case UNSPEC:
9701 /* The floating point round to integer frint* instructions. */
9702 if (aarch64_frint_unspec_p (XINT (x, 1)))
9703 {
9704 if (speed)
9705 *cost += extra_cost->fp[mode == DFmode].roundint;
9706
9707 return false;
9708 }
9709
9710 if (XINT (x, 1) == UNSPEC_RBIT)
9711 {
9712 if (speed)
9713 *cost += extra_cost->alu.rev;
9714
9715 return false;
9716 }
9717 break;
9718
9719 case TRUNCATE:
9720
9721 /* Decompose <su>muldi3_highpart. */
9722 if (/* (truncate:DI */
9723 mode == DImode
9724 /* (lshiftrt:TI */
9725 && GET_MODE (XEXP (x, 0)) == TImode
9726 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9727 /* (mult:TI */
9728 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9729 /* (ANY_EXTEND:TI (reg:DI))
9730 (ANY_EXTEND:TI (reg:DI))) */
9731 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9732 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9733 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9734 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9735 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9736 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9737 /* (const_int 64) */
9738 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9739 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9740 {
9741 /* UMULH/SMULH. */
9742 if (speed)
9743 *cost += extra_cost->mult[mode == DImode].extend;
9744 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9745 mode, MULT, 0, speed);
9746 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9747 mode, MULT, 1, speed);
9748 return true;
9749 }
9750
9751 /* Fall through. */
9752 default:
9753 break;
9754 }
9755
9756 if (dump_file
9757 && flag_aarch64_verbose_cost)
9758 fprintf (dump_file,
9759 "\nFailed to cost RTX. Assuming default cost.\n");
9760
9761 return true;
9762 }
9763
9764 /* Wrapper around aarch64_rtx_costs, dumps the partial or total cost
9765 calculated for X. This cost is stored in *COST. Returns true
9766 if the total cost of X was calculated. */
9767 static bool
9768 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9769 int param, int *cost, bool speed)
9770 {
9771 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9772
9773 if (dump_file
9774 && flag_aarch64_verbose_cost)
9775 {
9776 print_rtl_single (dump_file, x);
9777 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9778 speed ? "Hot" : "Cold",
9779 *cost, result ? "final" : "partial");
9780 }
9781
9782 return result;
9783 }
9784
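/* As a rough illustration of the verbose cost dump produced above (the
   numeric value is made up): with a dump file active and
   flag_aarch64_verbose_cost set, each costed RTX is printed and followed
   by a summary line such as
   Hot cost: 8 (final)
   where "Hot"/"Cold" reflects SPEED, the number is the accumulated *COST
   and "final"/"partial" mirrors the boolean returned to the middle end.  */
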
9785 static int
9786 aarch64_register_move_cost (machine_mode mode,
9787 reg_class_t from_i, reg_class_t to_i)
9788 {
9789 enum reg_class from = (enum reg_class) from_i;
9790 enum reg_class to = (enum reg_class) to_i;
9791 const struct cpu_regmove_cost *regmove_cost
9792 = aarch64_tune_params.regmove_cost;
9793
9794 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9795 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9796 to = GENERAL_REGS;
9797
9798 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9799 from = GENERAL_REGS;
9800
9801 /* Moving between GPR and stack cost is the same as GP2GP. */
9802 if ((from == GENERAL_REGS && to == STACK_REG)
9803 || (to == GENERAL_REGS && from == STACK_REG))
9804 return regmove_cost->GP2GP;
9805
9806 /* To/From the stack register, we move via the gprs. */
9807 if (to == STACK_REG || from == STACK_REG)
9808 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9809 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9810
9811 if (known_eq (GET_MODE_SIZE (mode), 16))
9812 {
9813 /* 128-bit operations on general registers require 2 instructions. */
9814 if (from == GENERAL_REGS && to == GENERAL_REGS)
9815 return regmove_cost->GP2GP * 2;
9816 else if (from == GENERAL_REGS)
9817 return regmove_cost->GP2FP * 2;
9818 else if (to == GENERAL_REGS)
9819 return regmove_cost->FP2GP * 2;
9820
9821 /* When AdvSIMD instructions are disabled it is not possible to move
9822 a 128-bit value directly between Q registers. This is handled in
9823 secondary reload. A general register is used as a scratch to move
9824 the upper DI value and the lower DI value is moved directly,
9825 hence the cost is the sum of three moves. */
9826 if (! TARGET_SIMD)
9827 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9828
9829 return regmove_cost->FP2FP;
9830 }
9831
9832 if (from == GENERAL_REGS && to == GENERAL_REGS)
9833 return regmove_cost->GP2GP;
9834 else if (from == GENERAL_REGS)
9835 return regmove_cost->GP2FP;
9836 else if (to == GENERAL_REGS)
9837 return regmove_cost->FP2GP;
9838
9839 return regmove_cost->FP2FP;
9840 }
9841
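/* Worked reading of the cost composition above (the actual numbers come
   from the regmove_cost table of the active tuning): a 128-bit move
   between two FP registers with !TARGET_SIMD is costed as
   GP2FP + FP2GP + FP2FP, since the upper DImode half goes through a
   general-register scratch while the lower half is moved directly, whereas
   a 128-bit move between GENERAL_REGS and FP_REGS costs twice the
   corresponding 64-bit GP2FP or FP2GP figure.  */
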
9842 static int
9843 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9844 reg_class_t rclass ATTRIBUTE_UNUSED,
9845 bool in ATTRIBUTE_UNUSED)
9846 {
9847 return aarch64_tune_params.memmov_cost;
9848 }
9849
9850 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9851 to optimize 1.0/sqrt. */
9852
9853 static bool
9854 use_rsqrt_p (machine_mode mode)
9855 {
9856 return (!flag_trapping_math
9857 && flag_unsafe_math_optimizations
9858 && ((aarch64_tune_params.approx_modes->recip_sqrt
9859 & AARCH64_APPROX_MODE (mode))
9860 || flag_mrecip_low_precision_sqrt));
9861 }
9862
9863 /* Function to decide when to use the approximate reciprocal square root
9864 builtin. */
9865
9866 static tree
9867 aarch64_builtin_reciprocal (tree fndecl)
9868 {
9869 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9870
9871 if (!use_rsqrt_p (mode))
9872 return NULL_TREE;
9873 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9874 }
9875
9876 typedef rtx (*rsqrte_type) (rtx, rtx);
9877
9878 /* Select reciprocal square root initial estimate insn depending on machine
9879 mode. */
9880
9881 static rsqrte_type
9882 get_rsqrte_type (machine_mode mode)
9883 {
9884 switch (mode)
9885 {
9886 case E_DFmode: return gen_aarch64_rsqrtedf;
9887 case E_SFmode: return gen_aarch64_rsqrtesf;
9888 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9889 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9890 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9891 default: gcc_unreachable ();
9892 }
9893 }
9894
9895 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9896
9897 /* Select reciprocal square root series step insn depending on machine mode. */
9898
9899 static rsqrts_type
9900 get_rsqrts_type (machine_mode mode)
9901 {
9902 switch (mode)
9903 {
9904 case E_DFmode: return gen_aarch64_rsqrtsdf;
9905 case E_SFmode: return gen_aarch64_rsqrtssf;
9906 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9907 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9908 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9909 default: gcc_unreachable ();
9910 }
9911 }
9912
9913 /* Emit instruction sequence to compute either the approximate square root
9914 or its approximate reciprocal, depending on the flag RECP, and return
9915 whether the sequence was emitted or not. */
9916
9917 bool
9918 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9919 {
9920 machine_mode mode = GET_MODE (dst);
9921
9922 if (GET_MODE_INNER (mode) == HFmode)
9923 {
9924 gcc_assert (!recp);
9925 return false;
9926 }
9927
9928 if (!recp)
9929 {
9930 if (!(flag_mlow_precision_sqrt
9931 || (aarch64_tune_params.approx_modes->sqrt
9932 & AARCH64_APPROX_MODE (mode))))
9933 return false;
9934
9935 if (flag_finite_math_only
9936 || flag_trapping_math
9937 || !flag_unsafe_math_optimizations
9938 || optimize_function_for_size_p (cfun))
9939 return false;
9940 }
9941 else
9942 /* Caller assumes we cannot fail. */
9943 gcc_assert (use_rsqrt_p (mode));
9944
9945 machine_mode mmsk = mode_for_int_vector (mode).require ();
9946 rtx xmsk = gen_reg_rtx (mmsk);
9947 if (!recp)
9948 /* When calculating the approximate square root, compare the
9949 argument with 0.0 and create a mask. */
9950 emit_insn (gen_rtx_SET (xmsk,
9951 gen_rtx_NEG (mmsk,
9952 gen_rtx_EQ (mmsk, src,
9953 CONST0_RTX (mode)))));
9954
9955 /* Estimate the approximate reciprocal square root. */
9956 rtx xdst = gen_reg_rtx (mode);
9957 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9958
9959 /* Iterate over the series twice for SF and thrice for DF. */
9960 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9961
9962 /* Optionally iterate over the series once less for faster performance
9963 at the cost of some accuracy. */
9964 if ((recp && flag_mrecip_low_precision_sqrt)
9965 || (!recp && flag_mlow_precision_sqrt))
9966 iterations--;
9967
9968 /* Iterate over the series to calculate the approximate reciprocal square
9969 root. */
9970 rtx x1 = gen_reg_rtx (mode);
9971 while (iterations--)
9972 {
9973 rtx x2 = gen_reg_rtx (mode);
9974 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9975
9976 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9977
9978 if (iterations > 0)
9979 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9980 }
9981
9982 if (!recp)
9983 {
9984 /* Qualify the approximate reciprocal square root when the argument is
9985 0.0 by squashing the intermediary result to 0.0. */
9986 rtx xtmp = gen_reg_rtx (mmsk);
9987 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9988 gen_rtx_SUBREG (mmsk, xdst, 0)));
9989 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9990
9991 /* Calculate the approximate square root. */
9992 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9993 }
9994
9995 /* Finalize the approximation. */
9996 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9997
9998 return true;
9999 }
10000
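/* A minimal scalar sketch of the Newton-Raphson refinement emitted above,
   kept out of the build; the helper name and the caller-supplied ESTIMATE
   are inventions for illustration and stand in for the hardware FRSQRTE
   result.  FRSQRTS computes (3 - d * x * x) / 2 and each iteration
   multiplies the current estimate by that factor, e.g. for d = 0.25 and a
   made-up estimate x = 1.75 a single step gives
   1.75 * (3 - 0.25 * 1.75 * 1.75) / 2 = 1.955078125, approaching the
   exact value 2.0.  */
#if 0
static double
aarch64_rsqrt_reference_model (double d, double estimate, int iterations)
{
  /* Stand-in for the FRSQRTE initial estimate.  */
  double x = estimate;
  while (iterations--)
    /* One FRSQRTS step, (3 - d * x * x) / 2, followed by the multiply that
       the expander emits via gen_rtx_MULT.  */
    x = x * (3.0 - d * x * x) / 2.0;
  /* Approximates 1 / sqrt (d); the non-reciprocal path above additionally
     multiplies by SRC to recover sqrt (d).  */
  return x;
}
#endif
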
10001 typedef rtx (*recpe_type) (rtx, rtx);
10002
10003 /* Select reciprocal initial estimate insn depending on machine mode. */
10004
10005 static recpe_type
10006 get_recpe_type (machine_mode mode)
10007 {
10008 switch (mode)
10009 {
10010 case E_SFmode: return (gen_aarch64_frecpesf);
10011 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
10012 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
10013 case E_DFmode: return (gen_aarch64_frecpedf);
10014 case E_V2DFmode: return (gen_aarch64_frecpev2df);
10015 default: gcc_unreachable ();
10016 }
10017 }
10018
10019 typedef rtx (*recps_type) (rtx, rtx, rtx);
10020
10021 /* Select reciprocal series step insn depending on machine mode. */
10022
10023 static recps_type
10024 get_recps_type (machine_mode mode)
10025 {
10026 switch (mode)
10027 {
10028 case E_SFmode: return (gen_aarch64_frecpssf);
10029 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10030 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10031 case E_DFmode: return (gen_aarch64_frecpsdf);
10032 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10033 default: gcc_unreachable ();
10034 }
10035 }
10036
10037 /* Emit the instruction sequence to compute the approximation for the division
10038 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10039
10040 bool
10041 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10042 {
10043 machine_mode mode = GET_MODE (quo);
10044
10045 if (GET_MODE_INNER (mode) == HFmode)
10046 return false;
10047
10048 bool use_approx_division_p = (flag_mlow_precision_div
10049 || (aarch64_tune_params.approx_modes->division
10050 & AARCH64_APPROX_MODE (mode)));
10051
10052 if (!flag_finite_math_only
10053 || flag_trapping_math
10054 || !flag_unsafe_math_optimizations
10055 || optimize_function_for_size_p (cfun)
10056 || !use_approx_division_p)
10057 return false;
10058
10059 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10060 return false;
10061
10062 /* Estimate the approximate reciprocal. */
10063 rtx xrcp = gen_reg_rtx (mode);
10064 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10065
10066 /* Iterate over the series twice for SF and thrice for DF. */
10067 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10068
10069 /* Optionally iterate over the series once less for faster performance,
10070 at the cost of some accuracy. */
10071 if (flag_mlow_precision_div)
10072 iterations--;
10073
10074 /* Iterate over the series to calculate the approximate reciprocal. */
10075 rtx xtmp = gen_reg_rtx (mode);
10076 while (iterations--)
10077 {
10078 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10079
10080 if (iterations > 0)
10081 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10082 }
10083
10084 if (num != CONST1_RTX (mode))
10085 {
10086 /* As the approximate reciprocal of DEN is already calculated, only
10087 calculate the approximate division when NUM is not 1.0. */
10088 rtx xnum = force_reg (mode, num);
10089 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10090 }
10091
10092 /* Finalize the approximation. */
10093 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10094 return true;
10095 }
10096
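/* Worked example of the FRECPS refinement used above (the starting value
   is made up; the real FRECPE estimate differs).  FRECPS computes
   2 - d * x, so each step maps x to x * (2 - d * x).  For d = 3.0 and
   x = 0.3:
   step 1: 0.3  * (2 - 3.0 * 0.3)  = 0.33
   step 2: 0.33 * (2 - 3.0 * 0.33) = 0.3333
   converging towards 1/3; the final multiply by NUM (when NUM is not 1.0)
   then yields the approximation of NUM / DEN.  */
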
10097 /* Return the number of instructions that can be issued per cycle. */
10098 static int
10099 aarch64_sched_issue_rate (void)
10100 {
10101 return aarch64_tune_params.issue_rate;
10102 }
10103
10104 static int
10105 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10106 {
10107 int issue_rate = aarch64_sched_issue_rate ();
10108
10109 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10110 }
10111
10112
10113 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10114 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10115 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10116
10117 static int
10118 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10119 int ready_index)
10120 {
10121 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10122 }
10123
10124
10125 /* Vectorizer cost model target hooks. */
10126
10127 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10128 static int
10129 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10130 tree vectype,
10131 int misalign ATTRIBUTE_UNUSED)
10132 {
10133 unsigned elements;
10134 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10135 bool fp = false;
10136
10137 if (vectype != NULL)
10138 fp = FLOAT_TYPE_P (vectype);
10139
10140 switch (type_of_cost)
10141 {
10142 case scalar_stmt:
10143 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10144
10145 case scalar_load:
10146 return costs->scalar_load_cost;
10147
10148 case scalar_store:
10149 return costs->scalar_store_cost;
10150
10151 case vector_stmt:
10152 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10153
10154 case vector_load:
10155 return costs->vec_align_load_cost;
10156
10157 case vector_store:
10158 return costs->vec_store_cost;
10159
10160 case vec_to_scalar:
10161 return costs->vec_to_scalar_cost;
10162
10163 case scalar_to_vec:
10164 return costs->scalar_to_vec_cost;
10165
10166 case unaligned_load:
10167 case vector_gather_load:
10168 return costs->vec_unalign_load_cost;
10169
10170 case unaligned_store:
10171 case vector_scatter_store:
10172 return costs->vec_unalign_store_cost;
10173
10174 case cond_branch_taken:
10175 return costs->cond_taken_branch_cost;
10176
10177 case cond_branch_not_taken:
10178 return costs->cond_not_taken_branch_cost;
10179
10180 case vec_perm:
10181 return costs->vec_permute_cost;
10182
10183 case vec_promote_demote:
10184 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10185
10186 case vec_construct:
10187 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10188 return elements / 2 + 1;
10189
10190 default:
10191 gcc_unreachable ();
10192 }
10193 }
10194
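/* Illustrative arithmetic for the vec_construct case above (the element
   counts are examples, not statements about any particular tuning): a
   vector type with four subparts, e.g. V4SF, is costed as 4 / 2 + 1 = 3,
   and an eight-element vector as 8 / 2 + 1 = 5.  */
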
10195 /* Implement targetm.vectorize.add_stmt_cost. */
10196 static unsigned
10197 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10198 struct _stmt_vec_info *stmt_info, int misalign,
10199 enum vect_cost_model_location where)
10200 {
10201 unsigned *cost = (unsigned *) data;
10202 unsigned retval = 0;
10203
10204 if (flag_vect_cost_model)
10205 {
10206 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10207 int stmt_cost =
10208 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10209
10210 /* Statements in an inner loop relative to the loop being
10211 vectorized are weighted more heavily. The value here is
10212 arbitrary and could potentially be improved with analysis. */
10213 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10214 count *= 50; /* FIXME */
10215
10216 retval = (unsigned) (count * stmt_cost);
10217 cost[where] += retval;
10218 }
10219
10220 return retval;
10221 }
10222
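/* Worked example of the weighting above (the statement cost of 4 is made
   up): a vector statement costed at 4 that sits in an inner loop relative
   to the loop being vectorized contributes 1 * 50 * 4 = 200 to the
   vect_body bucket, whereas the same statement outside such an inner loop
   contributes just 4.  */
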
10223 static void initialize_aarch64_code_model (struct gcc_options *);
10224
10225 /* Parse the TO_PARSE string and put the architecture struct that it
10226 selects into RES and the architectural features into ISA_FLAGS.
10227 Return an aarch64_parse_opt_result describing the parse result.
10228 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10229
10230 static enum aarch64_parse_opt_result
10231 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10232 unsigned long *isa_flags)
10233 {
10234 char *ext;
10235 const struct processor *arch;
10236 char *str = (char *) alloca (strlen (to_parse) + 1);
10237 size_t len;
10238
10239 strcpy (str, to_parse);
10240
10241 ext = strchr (str, '+');
10242
10243 if (ext != NULL)
10244 len = ext - str;
10245 else
10246 len = strlen (str);
10247
10248 if (len == 0)
10249 return AARCH64_PARSE_MISSING_ARG;
10250
10251
10252 /* Loop through the list of supported ARCHes to find a match. */
10253 for (arch = all_architectures; arch->name != NULL; arch++)
10254 {
10255 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10256 {
10257 unsigned long isa_temp = arch->flags;
10258
10259 if (ext != NULL)
10260 {
10261 /* TO_PARSE string contains at least one extension. */
10262 enum aarch64_parse_opt_result ext_res
10263 = aarch64_parse_extension (ext, &isa_temp);
10264
10265 if (ext_res != AARCH64_PARSE_OK)
10266 return ext_res;
10267 }
10268 /* Extension parsing was successful. Confirm the result
10269 arch and ISA flags. */
10270 *res = arch;
10271 *isa_flags = isa_temp;
10272 return AARCH64_PARSE_OK;
10273 }
10274 }
10275
10276 /* ARCH name not found in list. */
10277 return AARCH64_PARSE_INVALID_ARG;
10278 }
10279
10280 /* Parse the TO_PARSE string and put the result tuning in RES and the
10281 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10282 describing the parse result. If there is an error parsing, RES and
10283 ISA_FLAGS are left unchanged. */
10284
10285 static enum aarch64_parse_opt_result
10286 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10287 unsigned long *isa_flags)
10288 {
10289 char *ext;
10290 const struct processor *cpu;
10291 char *str = (char *) alloca (strlen (to_parse) + 1);
10292 size_t len;
10293
10294 strcpy (str, to_parse);
10295
10296 ext = strchr (str, '+');
10297
10298 if (ext != NULL)
10299 len = ext - str;
10300 else
10301 len = strlen (str);
10302
10303 if (len == 0)
10304 return AARCH64_PARSE_MISSING_ARG;
10305
10306
10307 /* Loop through the list of supported CPUs to find a match. */
10308 for (cpu = all_cores; cpu->name != NULL; cpu++)
10309 {
10310 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10311 {
10312 unsigned long isa_temp = cpu->flags;
10313
10314
10315 if (ext != NULL)
10316 {
10317 /* TO_PARSE string contains at least one extension. */
10318 enum aarch64_parse_opt_result ext_res
10319 = aarch64_parse_extension (ext, &isa_temp);
10320
10321 if (ext_res != AARCH64_PARSE_OK)
10322 return ext_res;
10323 }
10324 /* Extension parsing was successful. Confirm the result
10325 cpu and ISA flags. */
10326 *res = cpu;
10327 *isa_flags = isa_temp;
10328 return AARCH64_PARSE_OK;
10329 }
10330 }
10331
10332 /* CPU name not found in list. */
10333 return AARCH64_PARSE_INVALID_ARG;
10334 }
10335
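/* Example of the splitting performed by the two parsers above (the names
   are only illustrative; validity is decided by the all_cores /
   all_architectures tables and by aarch64_parse_extension): for
   "-mcpu=cortex-a57+crypto" the string is "cortex-a57+crypto", the '+'
   bounds the cpu name (LEN = 10) and the remainder "+crypto" is handed to
   aarch64_parse_extension to adjust the ISA flags.  */
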
10336 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10337 Return an aarch64_parse_opt_result describing the parse result.
10338 If the parsing fails, RES does not change. */
10339
10340 static enum aarch64_parse_opt_result
10341 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10342 {
10343 const struct processor *cpu;
10344 char *str = (char *) alloca (strlen (to_parse) + 1);
10345
10346 strcpy (str, to_parse);
10347
10348 /* Loop through the list of supported CPUs to find a match. */
10349 for (cpu = all_cores; cpu->name != NULL; cpu++)
10350 {
10351 if (strcmp (cpu->name, str) == 0)
10352 {
10353 *res = cpu;
10354 return AARCH64_PARSE_OK;
10355 }
10356 }
10357
10358 /* CPU name not found in list. */
10359 return AARCH64_PARSE_INVALID_ARG;
10360 }
10361
10362 /* Parse TOKEN, which has length LENGTH to see if it is an option
10363 described in FLAG. If it is, return the index bit for that fusion type.
10364 If not, error (printing OPTION_NAME) and return zero. */
10365
10366 static unsigned int
10367 aarch64_parse_one_option_token (const char *token,
10368 size_t length,
10369 const struct aarch64_flag_desc *flag,
10370 const char *option_name)
10371 {
10372 for (; flag->name != NULL; flag++)
10373 {
10374 if (length == strlen (flag->name)
10375 && !strncmp (flag->name, token, length))
10376 return flag->flag;
10377 }
10378
10379 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10380 return 0;
10381 }
10382
10383 /* Parse OPTION which is a comma-separated list of flags to enable.
10384 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10385 default state we inherit from the CPU tuning structures. OPTION_NAME
10386 gives the top-level option we are parsing in the -moverride string,
10387 for use in error messages. */
10388
10389 static unsigned int
10390 aarch64_parse_boolean_options (const char *option,
10391 const struct aarch64_flag_desc *flags,
10392 unsigned int initial_state,
10393 const char *option_name)
10394 {
10395 const char separator = '.';
10396 const char* specs = option;
10397 const char* ntoken = option;
10398 unsigned int found_flags = initial_state;
10399
10400 while ((ntoken = strchr (specs, separator)))
10401 {
10402 size_t token_length = ntoken - specs;
10403 unsigned token_ops = aarch64_parse_one_option_token (specs,
10404 token_length,
10405 flags,
10406 option_name);
10407 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10408 in the token stream, reset the supported operations. So:
10409
10410 adrp+add.cmp+branch.none.adrp+add
10411
10412 would have the result of turning on only adrp+add fusion. */
10413 if (!token_ops)
10414 found_flags = 0;
10415
10416 found_flags |= token_ops;
10417 specs = ++ntoken;
10418 }
10419
10420 /* If we ended with the separator, the string is ill-formed. */
10421 if (!(*specs))
10422 {
10423 error ("%s string ill-formed\n", option_name);
10424 return 0;
10425 }
10426
10427 /* We still have one more token to parse. */
10428 size_t token_length = strlen (specs);
10429 unsigned token_ops = aarch64_parse_one_option_token (specs,
10430 token_length,
10431 flags,
10432 option_name);
10433 if (!token_ops)
10434 found_flags = 0;
10435
10436 found_flags |= token_ops;
10437 return found_flags;
10438 }
10439
10440 /* Support for overriding instruction fusion. */
10441
10442 static void
10443 aarch64_parse_fuse_string (const char *fuse_string,
10444 struct tune_params *tune)
10445 {
10446 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10447 aarch64_fusible_pairs,
10448 tune->fusible_ops,
10449 "fuse=");
10450 }
10451
10452 /* Support for overriding other tuning flags. */
10453
10454 static void
10455 aarch64_parse_tune_string (const char *tune_string,
10456 struct tune_params *tune)
10457 {
10458 tune->extra_tuning_flags
10459 = aarch64_parse_boolean_options (tune_string,
10460 aarch64_tuning_flags,
10461 tune->extra_tuning_flags,
10462 "tune=");
10463 }
10464
10465 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10466 we understand. If it is, extract the option string and hand off to
10467 the appropriate function. */
10468
10469 void
10470 aarch64_parse_one_override_token (const char* token,
10471 size_t length,
10472 struct tune_params *tune)
10473 {
10474 const struct aarch64_tuning_override_function *fn
10475 = aarch64_tuning_override_functions;
10476
10477 const char *option_part = strchr (token, '=');
10478 if (!option_part)
10479 {
10480 error ("tuning string missing in option (%s)", token);
10481 return;
10482 }
10483
10484 /* Get the length of the option name. */
10485 length = option_part - token;
10486 /* Skip the '=' to get to the option string. */
10487 option_part++;
10488
10489 for (; fn->name != NULL; fn++)
10490 {
10491 if (!strncmp (fn->name, token, length))
10492 {
10493 fn->parse_override (option_part, tune);
10494 return;
10495 }
10496 }
10497
10498 error ("unknown tuning option (%s)", token);
10499 return;
10500 }
10501
10502 /* Set a default TLS size and clamp it to what the code model allows. */
10503
10504 static void
10505 initialize_aarch64_tls_size (struct gcc_options *opts)
10506 {
10507 if (aarch64_tls_size == 0)
10508 aarch64_tls_size = 24;
10509
10510 switch (opts->x_aarch64_cmodel_var)
10511 {
10512 case AARCH64_CMODEL_TINY:
10513 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10514 needs two instructions to address, so we clamp the size to 24. */
10515 if (aarch64_tls_size > 24)
10516 aarch64_tls_size = 24;
10517 break;
10518 case AARCH64_CMODEL_SMALL:
10519 /* The maximum TLS size allowed under small is 4G. */
10520 if (aarch64_tls_size > 32)
10521 aarch64_tls_size = 32;
10522 break;
10523 case AARCH64_CMODEL_LARGE:
10524 /* The maximum TLS size allowed under large is 16E.
10525 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10526 if (aarch64_tls_size > 48)
10527 aarch64_tls_size = 48;
10528 break;
10529 default:
10530 gcc_unreachable ();
10531 }
10532
10533 return;
10534 }
10535
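/* Example of the clamping above: with no explicit size, aarch64_tls_size
   defaults to 24 bits; a request for 32 bits is kept under the small code
   model but clamped down to 24 under tiny, and 48 is the effective ceiling
   under the large model.  */
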
10536 /* Parse STRING looking for options in the format:
10537 string :: option:string
10538 option :: name=substring
10539 name :: {a-z}
10540 substring :: defined by option. */
10541
10542 static void
10543 aarch64_parse_override_string (const char* input_string,
10544 struct tune_params* tune)
10545 {
10546 const char separator = ':';
10547 size_t string_length = strlen (input_string) + 1;
10548 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10549 char *string = string_root;
10550 strncpy (string, input_string, string_length);
10551 string[string_length - 1] = '\0';
10552
10553 char* ntoken = string;
10554
10555 while ((ntoken = strchr (string, separator)))
10556 {
10557 size_t token_length = ntoken - string;
10558 /* Make this substring look like a string. */
10559 *ntoken = '\0';
10560 aarch64_parse_one_override_token (string, token_length, tune);
10561 string = ++ntoken;
10562 }
10563
10564 /* One last option to parse. */
10565 aarch64_parse_one_override_token (string, strlen (string), tune);
10566 free (string_root);
10567 }
10568
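/* Putting the two parsing layers together (the fusion pair names are the
   ones from the example further up; the tuning flag name is deliberately
   left as a placeholder): an override string such as
   fuse=adrp+add.cmp+branch:tune=<flag-name>
   is split on ':' into "fuse=..." and "tune=..." tokens here, and each
   token's value is then split on '.' by aarch64_parse_boolean_options.  */
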
10569
10570 static void
10571 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10572 {
10573 /* PR 70044: We have to be careful about being called multiple times for the
10574 same function. This means all changes should be repeatable. */
10575
10576 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10577 Disable the frame pointer flag so the mid-end will not use a frame
10578 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10579 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10580 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10581 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10582 if (opts->x_flag_omit_frame_pointer == 0)
10583 opts->x_flag_omit_frame_pointer = 2;
10584
10585 /* If not optimizing for size, set the default
10586 alignment to what the target wants. */
10587 if (!opts->x_optimize_size)
10588 {
10589 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10590 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10591 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10592 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10593 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10594 opts->x_str_align_functions = aarch64_tune_params.function_align;
10595 }
10596
10597 /* We default to no pc-relative literal loads. */
10598
10599 aarch64_pcrelative_literal_loads = false;
10600
10601 /* If -mpc-relative-literal-loads is set on the command line, this
10602 implies that the user asked for PC relative literal loads. */
10603 if (opts->x_pcrelative_literal_loads == 1)
10604 aarch64_pcrelative_literal_loads = true;
10605
10606 /* In the tiny memory model it makes no sense to disallow PC relative
10607 literal pool loads. */
10608 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10609 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10610 aarch64_pcrelative_literal_loads = true;
10611
10612 /* When enabling the lower precision Newton series for the square root, also
10613 enable it for the reciprocal square root, since the latter is an
10614 intermediary step for the former. */
10615 if (flag_mlow_precision_sqrt)
10616 flag_mrecip_low_precision_sqrt = true;
10617 }
10618
10619 /* 'Unpack' the internal tuning structs and update the options
10620 in OPTS. The caller must have set up selected_tune and selected_arch
10621 as all the other target-specific codegen decisions are
10622 derived from them. */
10623
10624 void
10625 aarch64_override_options_internal (struct gcc_options *opts)
10626 {
10627 aarch64_tune_flags = selected_tune->flags;
10628 aarch64_tune = selected_tune->sched_core;
10629 /* Make a copy of the tuning parameters attached to the core, which
10630 we may later overwrite. */
10631 aarch64_tune_params = *(selected_tune->tune);
10632 aarch64_architecture_version = selected_arch->architecture_version;
10633
10634 if (opts->x_aarch64_override_tune_string)
10635 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10636 &aarch64_tune_params);
10637
10638 /* This target defaults to strict volatile bitfields. */
10639 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10640 opts->x_flag_strict_volatile_bitfields = 1;
10641
10642 initialize_aarch64_code_model (opts);
10643 initialize_aarch64_tls_size (opts);
10644
10645 int queue_depth = 0;
10646 switch (aarch64_tune_params.autoprefetcher_model)
10647 {
10648 case tune_params::AUTOPREFETCHER_OFF:
10649 queue_depth = -1;
10650 break;
10651 case tune_params::AUTOPREFETCHER_WEAK:
10652 queue_depth = 0;
10653 break;
10654 case tune_params::AUTOPREFETCHER_STRONG:
10655 queue_depth = max_insn_queue_index + 1;
10656 break;
10657 default:
10658 gcc_unreachable ();
10659 }
10660
10661 /* We don't mind passing in global_options_set here as we don't use
10662 the *options_set structs anyway. */
10663 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10664 queue_depth,
10665 opts->x_param_values,
10666 global_options_set.x_param_values);
10667
10668 /* Set up parameters to be used in prefetching algorithm. Do not
10669 override the defaults unless we are tuning for a core we have
10670 researched values for. */
10671 if (aarch64_tune_params.prefetch->num_slots > 0)
10672 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10673 aarch64_tune_params.prefetch->num_slots,
10674 opts->x_param_values,
10675 global_options_set.x_param_values);
10676 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10677 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10678 aarch64_tune_params.prefetch->l1_cache_size,
10679 opts->x_param_values,
10680 global_options_set.x_param_values);
10681 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10682 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10683 aarch64_tune_params.prefetch->l1_cache_line_size,
10684 opts->x_param_values,
10685 global_options_set.x_param_values);
10686 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10687 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10688 aarch64_tune_params.prefetch->l2_cache_size,
10689 opts->x_param_values,
10690 global_options_set.x_param_values);
10691 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10692 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10693 0,
10694 opts->x_param_values,
10695 global_options_set.x_param_values);
10696 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10697 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10698 aarch64_tune_params.prefetch->minimum_stride,
10699 opts->x_param_values,
10700 global_options_set.x_param_values);
10701
10702 /* Use the alternative scheduling-pressure algorithm by default. */
10703 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10704 opts->x_param_values,
10705 global_options_set.x_param_values);
10706
10707 /* Enable software prefetching at the specified optimization level for
10708 CPUs that have prefetch support. Lower the optimization level threshold by 1
10709 when profiling is enabled. */
10710 if (opts->x_flag_prefetch_loop_arrays < 0
10711 && !opts->x_optimize_size
10712 && aarch64_tune_params.prefetch->default_opt_level >= 0
10713 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10714 opts->x_flag_prefetch_loop_arrays = 1;
10715
10716 aarch64_override_options_after_change_1 (opts);
10717 }
10718
10719 /* Print a hint with a suggestion for a core or architecture name that
10720 most closely resembles what the user passed in STR. ARCH is true if
10721 the user is asking for an architecture name. ARCH is false if the user
10722 is asking for a core name. */
10723
10724 static void
10725 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10726 {
10727 auto_vec<const char *> candidates;
10728 const struct processor *entry = arch ? all_architectures : all_cores;
10729 for (; entry->name != NULL; entry++)
10730 candidates.safe_push (entry->name);
10731
10732 #ifdef HAVE_LOCAL_CPU_DETECT
10733 /* Also add "native" as a possible value. */
10734 if (arch)
10735 candidates.safe_push ("native");
10736 #endif
10737
10738 char *s;
10739 const char *hint = candidates_list_and_hint (str, s, candidates);
10740 if (hint)
10741 inform (input_location, "valid arguments are: %s;"
10742 " did you mean %qs?", s, hint);
10743 else
10744 inform (input_location, "valid arguments are: %s", s);
10745
10746 XDELETEVEC (s);
10747 }
10748
10749 /* Print a hint with a suggestion for a core name that most closely resembles
10750 what the user passed in STR. */
10751
10752 inline static void
10753 aarch64_print_hint_for_core (const char *str)
10754 {
10755 aarch64_print_hint_for_core_or_arch (str, false);
10756 }
10757
10758 /* Print a hint with a suggestion for an architecture name that most closely
10759 resembles what the user passed in STR. */
10760
10761 inline static void
10762 aarch64_print_hint_for_arch (const char *str)
10763 {
10764 aarch64_print_hint_for_core_or_arch (str, true);
10765 }
10766
10767 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10768 specified in STR and throw errors if appropriate. Put the results, if
10769 they are valid, in RES and ISA_FLAGS. Return whether the option is
10770 valid. */
10771
10772 static bool
10773 aarch64_validate_mcpu (const char *str, const struct processor **res,
10774 unsigned long *isa_flags)
10775 {
10776 enum aarch64_parse_opt_result parse_res
10777 = aarch64_parse_cpu (str, res, isa_flags);
10778
10779 if (parse_res == AARCH64_PARSE_OK)
10780 return true;
10781
10782 switch (parse_res)
10783 {
10784 case AARCH64_PARSE_MISSING_ARG:
10785 error ("missing cpu name in %<-mcpu=%s%>", str);
10786 break;
10787 case AARCH64_PARSE_INVALID_ARG:
10788 error ("unknown value %qs for -mcpu", str);
10789 aarch64_print_hint_for_core (str);
10790 break;
10791 case AARCH64_PARSE_INVALID_FEATURE:
10792 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10793 break;
10794 default:
10795 gcc_unreachable ();
10796 }
10797
10798 return false;
10799 }
10800
10801 /* Validate a command-line -march option. Parse the arch and extensions
10802 (if any) specified in STR and throw errors if appropriate. Put the
10803 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10804 option is valid. */
10805
10806 static bool
10807 aarch64_validate_march (const char *str, const struct processor **res,
10808 unsigned long *isa_flags)
10809 {
10810 enum aarch64_parse_opt_result parse_res
10811 = aarch64_parse_arch (str, res, isa_flags);
10812
10813 if (parse_res == AARCH64_PARSE_OK)
10814 return true;
10815
10816 switch (parse_res)
10817 {
10818 case AARCH64_PARSE_MISSING_ARG:
10819 error ("missing arch name in %<-march=%s%>", str);
10820 break;
10821 case AARCH64_PARSE_INVALID_ARG:
10822 error ("unknown value %qs for -march", str);
10823 aarch64_print_hint_for_arch (str);
10824 break;
10825 case AARCH64_PARSE_INVALID_FEATURE:
10826 error ("invalid feature modifier in %<-march=%s%>", str);
10827 break;
10828 default:
10829 gcc_unreachable ();
10830 }
10831
10832 return false;
10833 }
10834
10835 /* Validate a command-line -mtune option. Parse the cpu
10836 specified in STR and throw errors if appropriate. Put the
10837 result, if it is valid, in RES. Return whether the option is
10838 valid. */
10839
10840 static bool
10841 aarch64_validate_mtune (const char *str, const struct processor **res)
10842 {
10843 enum aarch64_parse_opt_result parse_res
10844 = aarch64_parse_tune (str, res);
10845
10846 if (parse_res == AARCH64_PARSE_OK)
10847 return true;
10848
10849 switch (parse_res)
10850 {
10851 case AARCH64_PARSE_MISSING_ARG:
10852 error ("missing cpu name in %<-mtune=%s%>", str);
10853 break;
10854 case AARCH64_PARSE_INVALID_ARG:
10855 error ("unknown value %qs for -mtune", str);
10856 aarch64_print_hint_for_core (str);
10857 break;
10858 default:
10859 gcc_unreachable ();
10860 }
10861 return false;
10862 }
10863
10864 /* Return the CPU corresponding to the enum CPU.
10865 If it doesn't specify a cpu, return the default. */
10866
10867 static const struct processor *
10868 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10869 {
10870 if (cpu != aarch64_none)
10871 return &all_cores[cpu];
10872
10873 /* The & 0x3f is to extract the bottom 6 bits that encode the
10874 default cpu as selected by the --with-cpu GCC configure option
10875 in config.gcc.
10876 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10877 flags mechanism should be reworked to make it more sane. */
10878 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10879 }
10880
10881 /* Return the architecture corresponding to the enum ARCH.
10882 If it doesn't specify a valid architecture, return the default. */
10883
10884 static const struct processor *
10885 aarch64_get_arch (enum aarch64_arch arch)
10886 {
10887 if (arch != aarch64_no_arch)
10888 return &all_architectures[arch];
10889
10890 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10891
10892 return &all_architectures[cpu->arch];
10893 }
10894
10895 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10896
10897 static poly_uint16
10898 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10899 {
10900 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10901 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10902 deciding which .md file patterns to use and when deciding whether
10903 something is a legitimate address or constant. */
10904 if (value == SVE_SCALABLE || value == SVE_128)
10905 return poly_uint16 (2, 2);
10906 else
10907 return (int) value / 64;
10908 }
10909
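/* Worked examples of the mapping above: SVE_256 gives 256 / 64 = 4 granules
   and SVE_2048 gives 32, while SVE_SCALABLE and SVE_128 both yield the
   runtime-variable poly_uint16 (2, 2) so that vector-length agnostic code
   is generated.  */
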
10910 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10911 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10912 tuning structs. In particular it must set selected_tune and
10913 aarch64_isa_flags that define the available ISA features and tuning
10914 decisions. It must also set selected_arch as this will be used to
10915 output the .arch asm tags for each function. */
10916
10917 static void
10918 aarch64_override_options (void)
10919 {
10920 unsigned long cpu_isa = 0;
10921 unsigned long arch_isa = 0;
10922 aarch64_isa_flags = 0;
10923
10924 bool valid_cpu = true;
10925 bool valid_tune = true;
10926 bool valid_arch = true;
10927
10928 selected_cpu = NULL;
10929 selected_arch = NULL;
10930 selected_tune = NULL;
10931
10932 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10933 If either of -march or -mtune is given, they override their
10934 respective component of -mcpu. */
10935 if (aarch64_cpu_string)
10936 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10937 &cpu_isa);
10938
10939 if (aarch64_arch_string)
10940 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10941 &arch_isa);
10942
10943 if (aarch64_tune_string)
10944 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10945
10946 /* If the user did not specify a processor, choose the default
10947 one for them. This will be the CPU set during configuration using
10948 --with-cpu, otherwise it is "generic". */
10949 if (!selected_cpu)
10950 {
10951 if (selected_arch)
10952 {
10953 selected_cpu = &all_cores[selected_arch->ident];
10954 aarch64_isa_flags = arch_isa;
10955 explicit_arch = selected_arch->arch;
10956 }
10957 else
10958 {
10959 /* Get default configure-time CPU. */
10960 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10961 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10962 }
10963
10964 if (selected_tune)
10965 explicit_tune_core = selected_tune->ident;
10966 }
10967 /* If both -mcpu and -march are specified check that they are architecturally
10968 compatible, warn if they're not and prefer the -march ISA flags. */
10969 else if (selected_arch)
10970 {
10971 if (selected_arch->arch != selected_cpu->arch)
10972 {
10973 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10974 all_architectures[selected_cpu->arch].name,
10975 selected_arch->name);
10976 }
10977 aarch64_isa_flags = arch_isa;
10978 explicit_arch = selected_arch->arch;
10979 explicit_tune_core = selected_tune ? selected_tune->ident
10980 : selected_cpu->ident;
10981 }
10982 else
10983 {
10984 /* -mcpu but no -march. */
10985 aarch64_isa_flags = cpu_isa;
10986 explicit_tune_core = selected_tune ? selected_tune->ident
10987 : selected_cpu->ident;
10988 gcc_assert (selected_cpu);
10989 selected_arch = &all_architectures[selected_cpu->arch];
10990 explicit_arch = selected_arch->arch;
10991 }
10992
10993 /* Set the arch as well, as we will need it when outputting
10994 the .arch directive in assembly. */
10995 if (!selected_arch)
10996 {
10997 gcc_assert (selected_cpu);
10998 selected_arch = &all_architectures[selected_cpu->arch];
10999 }
11000
11001 if (!selected_tune)
11002 selected_tune = selected_cpu;
11003
11004 #ifndef HAVE_AS_MABI_OPTION
11005 /* The compiler may have been configured with 2.23.* binutils, which does
11006 not have support for ILP32. */
11007 if (TARGET_ILP32)
11008 error ("assembler does not support -mabi=ilp32");
11009 #endif
11010
11011 /* Convert -msve-vector-bits to a VG count. */
11012 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11013
11014 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11015 sorry ("return address signing is only supported for -mabi=lp64");
11016
11017 /* Make sure we properly set up the explicit options. */
11018 if ((aarch64_cpu_string && valid_cpu)
11019 || (aarch64_tune_string && valid_tune))
11020 gcc_assert (explicit_tune_core != aarch64_none);
11021
11022 if ((aarch64_cpu_string && valid_cpu)
11023 || (aarch64_arch_string && valid_arch))
11024 gcc_assert (explicit_arch != aarch64_no_arch);
11025
11026 aarch64_override_options_internal (&global_options);
11027
11028 /* Save these options as the default ones in case we push and pop them later
11029 while processing functions with potential target attributes. */
11030 target_option_default_node = target_option_current_node
11031 = build_target_option_node (&global_options);
11032 }
11033
11034 /* Implement targetm.override_options_after_change. */
11035
11036 static void
11037 aarch64_override_options_after_change (void)
11038 {
11039 aarch64_override_options_after_change_1 (&global_options);
11040 }
11041
11042 static struct machine_function *
11043 aarch64_init_machine_status (void)
11044 {
11045 struct machine_function *machine;
11046 machine = ggc_cleared_alloc<machine_function> ();
11047 return machine;
11048 }
11049
11050 void
11051 aarch64_init_expanders (void)
11052 {
11053 init_machine_status = aarch64_init_machine_status;
11054 }
11055
11056 /* A checking mechanism for the implementation of the various code models. */
11057 static void
11058 initialize_aarch64_code_model (struct gcc_options *opts)
11059 {
11060 if (opts->x_flag_pic)
11061 {
11062 switch (opts->x_aarch64_cmodel_var)
11063 {
11064 case AARCH64_CMODEL_TINY:
11065 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11066 break;
11067 case AARCH64_CMODEL_SMALL:
11068 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11069 aarch64_cmodel = (flag_pic == 2
11070 ? AARCH64_CMODEL_SMALL_PIC
11071 : AARCH64_CMODEL_SMALL_SPIC);
11072 #else
11073 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11074 #endif
11075 break;
11076 case AARCH64_CMODEL_LARGE:
11077 sorry ("code model %qs with -f%s", "large",
11078 opts->x_flag_pic > 1 ? "PIC" : "pic");
11079 break;
11080 default:
11081 gcc_unreachable ();
11082 }
11083 }
11084 else
11085 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11086 }
11087
11088 /* Implement TARGET_OPTION_SAVE. */
11089
11090 static void
11091 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11092 {
11093 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11094 }
11095
11096 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11097 using the information saved in PTR. */
11098
11099 static void
11100 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11101 {
11102 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11103 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11104 opts->x_explicit_arch = ptr->x_explicit_arch;
11105 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11106 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11107
11108 aarch64_override_options_internal (opts);
11109 }
11110
11111 /* Implement TARGET_OPTION_PRINT. */
11112
11113 static void
11114 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11115 {
11116 const struct processor *cpu
11117 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11118 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11119 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11120 std::string extension
11121 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11122
11123 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11124 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11125 arch->name, extension.c_str ());
11126 }
11127
11128 static GTY(()) tree aarch64_previous_fndecl;
11129
11130 void
11131 aarch64_reset_previous_fndecl (void)
11132 {
11133 aarch64_previous_fndecl = NULL;
11134 }
11135
11136 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11137 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11138 make sure optab availability predicates are recomputed when necessary. */
11139
11140 void
11141 aarch64_save_restore_target_globals (tree new_tree)
11142 {
11143 if (TREE_TARGET_GLOBALS (new_tree))
11144 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11145 else if (new_tree == target_option_default_node)
11146 restore_target_globals (&default_target_globals);
11147 else
11148 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11149 }
11150
11151 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11152 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11153 of the function, if such exists. This function may be called multiple
11154 times on a single function so use aarch64_previous_fndecl to avoid
11155 setting up identical state. */
11156
11157 static void
11158 aarch64_set_current_function (tree fndecl)
11159 {
11160 if (!fndecl || fndecl == aarch64_previous_fndecl)
11161 return;
11162
11163 tree old_tree = (aarch64_previous_fndecl
11164 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11165 : NULL_TREE);
11166
11167 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11168
11169 /* If current function has no attributes but the previous one did,
11170 use the default node. */
11171 if (!new_tree && old_tree)
11172 new_tree = target_option_default_node;
11173
11174 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11175 the default have been handled by aarch64_save_restore_target_globals from
11176 aarch64_pragma_target_parse. */
11177 if (old_tree == new_tree)
11178 return;
11179
11180 aarch64_previous_fndecl = fndecl;
11181
11182 /* First set the target options. */
11183 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11184
11185 aarch64_save_restore_target_globals (new_tree);
11186 }
11187
11188 /* Enum describing the various ways we can handle attributes.
11189 In many cases we can reuse the generic option handling machinery. */
11190
11191 enum aarch64_attr_opt_type
11192 {
11193 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11194 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11195 aarch64_attr_enum, /* Attribute sets an enum variable. */
11196 aarch64_attr_custom /* Attribute requires a custom handling function. */
11197 };
11198
11199 /* All the information needed to handle a target attribute.
11200 NAME is the name of the attribute.
11201 ATTR_TYPE specifies the type of behavior of the attribute as described
11202 in the definition of enum aarch64_attr_opt_type.
11203 ALLOW_NEG is true if the attribute supports a "no-" form.
11204 	   HANDLER is the function that takes the attribute string as an argument.
11205 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11206 OPT_NUM is the enum specifying the option that the attribute modifies.
11207 This is needed for attributes that mirror the behavior of a command-line
11208 	   option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11209 aarch64_attr_enum. */
11210
11211 struct aarch64_attribute_info
11212 {
11213 const char *name;
11214 enum aarch64_attr_opt_type attr_type;
11215 bool allow_neg;
11216 bool (*handler) (const char *);
11217 enum opt_code opt_num;
11218 };
11219
11220 /* Handle the ARCH_STR argument to the arch= target attribute. */
11221
11222 static bool
11223 aarch64_handle_attr_arch (const char *str)
11224 {
11225 const struct processor *tmp_arch = NULL;
11226 enum aarch64_parse_opt_result parse_res
11227 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11228
11229 if (parse_res == AARCH64_PARSE_OK)
11230 {
11231 gcc_assert (tmp_arch);
11232 selected_arch = tmp_arch;
11233 explicit_arch = selected_arch->arch;
11234 return true;
11235 }
11236
11237 switch (parse_res)
11238 {
11239 case AARCH64_PARSE_MISSING_ARG:
11240 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11241 break;
11242 case AARCH64_PARSE_INVALID_ARG:
11243 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11244 aarch64_print_hint_for_arch (str);
11245 break;
11246 case AARCH64_PARSE_INVALID_FEATURE:
11247 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11248 break;
11249 default:
11250 gcc_unreachable ();
11251 }
11252
11253 return false;
11254 }
11255
11256 /* Handle the argument CPU_STR to the cpu= target attribute. */
11257
11258 static bool
11259 aarch64_handle_attr_cpu (const char *str)
11260 {
11261 const struct processor *tmp_cpu = NULL;
11262 enum aarch64_parse_opt_result parse_res
11263 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11264
11265 if (parse_res == AARCH64_PARSE_OK)
11266 {
11267 gcc_assert (tmp_cpu);
11268 selected_tune = tmp_cpu;
11269 explicit_tune_core = selected_tune->ident;
11270
11271 selected_arch = &all_architectures[tmp_cpu->arch];
11272 explicit_arch = selected_arch->arch;
11273 return true;
11274 }
11275
11276 switch (parse_res)
11277 {
11278 case AARCH64_PARSE_MISSING_ARG:
11279 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11280 break;
11281 case AARCH64_PARSE_INVALID_ARG:
11282 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11283 aarch64_print_hint_for_core (str);
11284 break;
11285 case AARCH64_PARSE_INVALID_FEATURE:
11286 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11287 break;
11288 default:
11289 gcc_unreachable ();
11290 }
11291
11292 return false;
11293 }
11294
11295 /* Handle the argument STR to the tune= target attribute. */
11296
11297 static bool
11298 aarch64_handle_attr_tune (const char *str)
11299 {
11300 const struct processor *tmp_tune = NULL;
11301 enum aarch64_parse_opt_result parse_res
11302 = aarch64_parse_tune (str, &tmp_tune);
11303
11304 if (parse_res == AARCH64_PARSE_OK)
11305 {
11306 gcc_assert (tmp_tune);
11307 selected_tune = tmp_tune;
11308 explicit_tune_core = selected_tune->ident;
11309 return true;
11310 }
11311
11312 switch (parse_res)
11313 {
11314 case AARCH64_PARSE_INVALID_ARG:
11315 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11316 aarch64_print_hint_for_core (str);
11317 break;
11318 default:
11319 gcc_unreachable ();
11320 }
11321
11322 return false;
11323 }
11324
11325 /* Parse an architecture extensions target attribute string specified in STR.
11326 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11327 if successful. Update aarch64_isa_flags to reflect the ISA features
11328 modified. */
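/* As an illustration (extension names here are only examples), a string
   such as "+nothing+fp" first clears all extension bits and then enables
   just the FP extension, while "+simd+nocrypto" adds SIMD and removes
   crypto relative to the current aarch64_isa_flags.  */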
11329
11330 static bool
11331 aarch64_handle_attr_isa_flags (char *str)
11332 {
11333 enum aarch64_parse_opt_result parse_res;
11334 unsigned long isa_flags = aarch64_isa_flags;
11335
11336 /* We allow "+nothing" in the beginning to clear out all architectural
11337 features if the user wants to handpick specific features. */
11338 if (strncmp ("+nothing", str, 8) == 0)
11339 {
11340 isa_flags = 0;
11341 str += 8;
11342 }
11343
11344 parse_res = aarch64_parse_extension (str, &isa_flags);
11345
11346 if (parse_res == AARCH64_PARSE_OK)
11347 {
11348 aarch64_isa_flags = isa_flags;
11349 return true;
11350 }
11351
11352 switch (parse_res)
11353 {
11354 case AARCH64_PARSE_MISSING_ARG:
11355 error ("missing value in %<target()%> pragma or attribute");
11356 break;
11357
11358 case AARCH64_PARSE_INVALID_FEATURE:
11359 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11360 break;
11361
11362 default:
11363 gcc_unreachable ();
11364 }
11365
11366 return false;
11367 }
11368
11369 /* The target attributes that we support. On top of these we also support just
11370 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11371 handled explicitly in aarch64_process_one_target_attr. */
11372
11373 static const struct aarch64_attribute_info aarch64_attributes[] =
11374 {
11375 { "general-regs-only", aarch64_attr_mask, false, NULL,
11376 OPT_mgeneral_regs_only },
11377 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11378 OPT_mfix_cortex_a53_835769 },
11379 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11380 OPT_mfix_cortex_a53_843419 },
11381 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11382 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11383 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11384 OPT_momit_leaf_frame_pointer },
11385 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11386 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11387 OPT_march_ },
11388 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11389 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11390 OPT_mtune_ },
11391 { "sign-return-address", aarch64_attr_enum, false, NULL,
11392 OPT_msign_return_address_ },
11393 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11394 };
11395
11396 /* Parse ARG_STR which contains the definition of one target attribute.
11397 Show appropriate errors if any or return true if the attribute is valid. */
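/* Typical forms of ARG_STR handled below include, for illustration:
   "arch=armv8.1-a" and "cpu=cortex-a57" (custom handlers),
   "no-strict-align" (a negated mask), "cmodel=small" (an enum) and
   "+crc" (raw ISA flags).  The specific names are example values only.  */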
11398
11399 static bool
11400 aarch64_process_one_target_attr (char *arg_str)
11401 {
11402 bool invert = false;
11403
11404 size_t len = strlen (arg_str);
11405
11406 if (len == 0)
11407 {
11408 error ("malformed %<target()%> pragma or attribute");
11409 return false;
11410 }
11411
11412 char *str_to_check = (char *) alloca (len + 1);
11413 strcpy (str_to_check, arg_str);
11414
11415 /* Skip leading whitespace. */
11416 while (*str_to_check == ' ' || *str_to_check == '\t')
11417 str_to_check++;
11418
11419 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11420 It is easier to detect and handle it explicitly here rather than going
11421 through the machinery for the rest of the target attributes in this
11422 function. */
11423 if (*str_to_check == '+')
11424 return aarch64_handle_attr_isa_flags (str_to_check);
11425
11426 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11427 {
11428 invert = true;
11429 str_to_check += 3;
11430 }
11431 char *arg = strchr (str_to_check, '=');
11432
11433 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11434 and point ARG to "foo". */
11435 if (arg)
11436 {
11437 *arg = '\0';
11438 arg++;
11439 }
11440 const struct aarch64_attribute_info *p_attr;
11441 bool found = false;
11442 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11443 {
11444 /* If the names don't match up, or the user has given an argument
11445 to an attribute that doesn't accept one, or didn't give an argument
11446 to an attribute that expects one, fail to match. */
11447 if (strcmp (str_to_check, p_attr->name) != 0)
11448 continue;
11449
11450 found = true;
11451 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11452 || p_attr->attr_type == aarch64_attr_enum;
11453
11454 if (attr_need_arg_p ^ (arg != NULL))
11455 {
11456 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11457 return false;
11458 }
11459
11460 /* If the name matches but the attribute does not allow "no-" versions
11461 then we can't match. */
11462 if (invert && !p_attr->allow_neg)
11463 {
11464 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11465 return false;
11466 }
11467
11468 switch (p_attr->attr_type)
11469 {
11470 /* Has a custom handler registered.
11471 For example, cpu=, arch=, tune=. */
11472 case aarch64_attr_custom:
11473 gcc_assert (p_attr->handler);
11474 if (!p_attr->handler (arg))
11475 return false;
11476 break;
11477
11478 /* Either set or unset a boolean option. */
11479 case aarch64_attr_bool:
11480 {
11481 struct cl_decoded_option decoded;
11482
11483 generate_option (p_attr->opt_num, NULL, !invert,
11484 CL_TARGET, &decoded);
11485 aarch64_handle_option (&global_options, &global_options_set,
11486 &decoded, input_location);
11487 break;
11488 }
11489 /* Set or unset a bit in the target_flags. aarch64_handle_option
11490 should know what mask to apply given the option number. */
11491 case aarch64_attr_mask:
11492 {
11493 struct cl_decoded_option decoded;
11494 /* We only need to specify the option number.
11495 aarch64_handle_option will know which mask to apply. */
11496 decoded.opt_index = p_attr->opt_num;
11497 decoded.value = !invert;
11498 aarch64_handle_option (&global_options, &global_options_set,
11499 &decoded, input_location);
11500 break;
11501 }
11502 /* Use the option setting machinery to set an option to an enum. */
11503 case aarch64_attr_enum:
11504 {
11505 gcc_assert (arg);
11506 bool valid;
11507 int value;
11508 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11509 &value, CL_TARGET);
11510 if (valid)
11511 {
11512 set_option (&global_options, NULL, p_attr->opt_num, value,
11513 NULL, DK_UNSPECIFIED, input_location,
11514 global_dc);
11515 }
11516 else
11517 {
11518 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11519 }
11520 break;
11521 }
11522 default:
11523 gcc_unreachable ();
11524 }
11525 }
11526
11527 /* If we reached here we either have found an attribute and validated
11528 it or didn't match any. If we matched an attribute but its arguments
11529 were malformed we will have returned false already. */
11530 return found;
11531 }
11532
11533 /* Count how many times the character C appears in
11534 NULL-terminated string STR. */
11535
11536 static unsigned int
11537 num_occurences_in_str (char c, char *str)
11538 {
11539 unsigned int res = 0;
11540 while (*str != '\0')
11541 {
11542 if (*str == c)
11543 res++;
11544
11545 str++;
11546 }
11547
11548 return res;
11549 }
11550
11551 /* Parse the tree in ARGS that contains the target attribute information
11552 and update the global target options space. */
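/* For example, __attribute__ ((target ("arch=armv8-a,no-strict-align")))
   reaches this function as the STRING_CST "arch=armv8-a,no-strict-align";
   it is split on ',' and each piece is handed to
   aarch64_process_one_target_attr.  */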
11553
11554 bool
11555 aarch64_process_target_attr (tree args)
11556 {
11557 if (TREE_CODE (args) == TREE_LIST)
11558 {
11559 do
11560 {
11561 tree head = TREE_VALUE (args);
11562 if (head)
11563 {
11564 if (!aarch64_process_target_attr (head))
11565 return false;
11566 }
11567 args = TREE_CHAIN (args);
11568 } while (args);
11569
11570 return true;
11571 }
11572
11573 if (TREE_CODE (args) != STRING_CST)
11574 {
11575 error ("attribute %<target%> argument not a string");
11576 return false;
11577 }
11578
11579 size_t len = strlen (TREE_STRING_POINTER (args));
11580 char *str_to_check = (char *) alloca (len + 1);
11581 strcpy (str_to_check, TREE_STRING_POINTER (args));
11582
11583 if (len == 0)
11584 {
11585 error ("malformed %<target()%> pragma or attribute");
11586 return false;
11587 }
11588
11589 /* Used to catch empty spaces between commas i.e.
11590 attribute ((target ("attr1,,attr2"))). */
11591 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11592
11593 /* Handle multiple target attributes separated by ','. */
11594 char *token = strtok (str_to_check, ",");
11595
11596 unsigned int num_attrs = 0;
11597 while (token)
11598 {
11599 num_attrs++;
11600 if (!aarch64_process_one_target_attr (token))
11601 {
11602 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11603 return false;
11604 }
11605
11606 token = strtok (NULL, ",");
11607 }
11608
11609 if (num_attrs != num_commas + 1)
11610 {
11611 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11612 return false;
11613 }
11614
11615 return true;
11616 }
11617
11618 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11619 process attribute ((target ("..."))). */
11620
11621 static bool
11622 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11623 {
11624 struct cl_target_option cur_target;
11625 bool ret;
11626 tree old_optimize;
11627 tree new_target, new_optimize;
11628 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11629
11630 /* If what we're processing is the current pragma string then the
11631 target option node is already stored in target_option_current_node
11632 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11633 having to re-parse the string. This is especially useful to keep
11634 arm_neon.h compile times down since that header contains a lot
11635 of intrinsics enclosed in pragmas. */
11636 if (!existing_target && args == current_target_pragma)
11637 {
11638 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11639 return true;
11640 }
11641 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11642
11643 old_optimize = build_optimization_node (&global_options);
11644 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11645
11646 /* If the function changed the optimization levels as well as setting
11647 target options, start with the optimizations specified. */
11648 if (func_optimize && func_optimize != old_optimize)
11649 cl_optimization_restore (&global_options,
11650 TREE_OPTIMIZATION (func_optimize));
11651
11652 /* Save the current target options to restore at the end. */
11653 cl_target_option_save (&cur_target, &global_options);
11654
11655 /* If fndecl already has some target attributes applied to it, unpack
11656 them so that we add this attribute on top of them, rather than
11657 overwriting them. */
11658 if (existing_target)
11659 {
11660 struct cl_target_option *existing_options
11661 = TREE_TARGET_OPTION (existing_target);
11662
11663 if (existing_options)
11664 cl_target_option_restore (&global_options, existing_options);
11665 }
11666 else
11667 cl_target_option_restore (&global_options,
11668 TREE_TARGET_OPTION (target_option_current_node));
11669
11670 ret = aarch64_process_target_attr (args);
11671
11672 /* Set up any additional state. */
11673 if (ret)
11674 {
11675 aarch64_override_options_internal (&global_options);
11676 /* Initialize SIMD builtins if we haven't already.
11677 Set current_target_pragma to NULL for the duration so that
11678 the builtin initialization code doesn't try to tag the functions
11679 being built with the attributes specified by any current pragma, thus
11680 going into an infinite recursion. */
11681 if (TARGET_SIMD)
11682 {
11683 tree saved_current_target_pragma = current_target_pragma;
11684 current_target_pragma = NULL;
11685 aarch64_init_simd_builtins ();
11686 current_target_pragma = saved_current_target_pragma;
11687 }
11688 new_target = build_target_option_node (&global_options);
11689 }
11690 else
11691 new_target = NULL;
11692
11693 new_optimize = build_optimization_node (&global_options);
11694
11695 if (fndecl && ret)
11696 {
11697 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11698
11699 if (old_optimize != new_optimize)
11700 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11701 }
11702
11703 cl_target_option_restore (&global_options, &cur_target);
11704
11705 if (old_optimize != new_optimize)
11706 cl_optimization_restore (&global_options,
11707 TREE_OPTIMIZATION (old_optimize));
11708 return ret;
11709 }
11710
11711 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11712 tri-bool options (yes, no, don't care) and the default value is
11713 DEF, determine whether to reject inlining. */
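/* For example, with DONT_CARE == 2 and DEF == 1:
   (caller 1, callee 2) and (caller 0, callee 1) both permit inlining,
   whereas (caller 1, callee 0) rejects it, because the callee explicitly
   chose a non-default value that differs from the caller's.  */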
11714
11715 static bool
11716 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11717 int dont_care, int def)
11718 {
11719 /* If the callee doesn't care, always allow inlining. */
11720 if (callee == dont_care)
11721 return true;
11722
11723 /* If the caller doesn't care, always allow inlining. */
11724 if (caller == dont_care)
11725 return true;
11726
11727 /* Otherwise, allow inlining if either the callee and caller values
11728 agree, or if the callee is using the default value. */
11729 return (callee == caller || callee == def);
11730 }
11731
11732 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11733 to inline CALLEE into CALLER based on target-specific info.
11734 Make sure that the caller and callee have compatible architectural
11735 features. Then go through the other possible target attributes
11736 and see if they can block inlining. Try not to reject always_inline
11737 callees unless they are incompatible architecturally. */
11738
11739 static bool
11740 aarch64_can_inline_p (tree caller, tree callee)
11741 {
11742 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11743 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11744
11745 struct cl_target_option *caller_opts
11746 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11747 : target_option_default_node);
11748
11749 struct cl_target_option *callee_opts
11750 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11751 : target_option_default_node);
11752
11753 /* Callee's ISA flags should be a subset of the caller's. */
11754 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11755 != callee_opts->x_aarch64_isa_flags)
11756 return false;
11757
11758 /* Allow non-strict aligned functions inlining into strict
11759 aligned ones. */
11760 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11761 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11762 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11763 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11764 return false;
11765
11766 bool always_inline = lookup_attribute ("always_inline",
11767 DECL_ATTRIBUTES (callee));
11768
11769 /* If the architectural features match up and the callee is always_inline
11770 then the other attributes don't matter. */
11771 if (always_inline)
11772 return true;
11773
11774 if (caller_opts->x_aarch64_cmodel_var
11775 != callee_opts->x_aarch64_cmodel_var)
11776 return false;
11777
11778 if (caller_opts->x_aarch64_tls_dialect
11779 != callee_opts->x_aarch64_tls_dialect)
11780 return false;
11781
11782 	  /* Honour explicit requests to work around errata. */
11783 if (!aarch64_tribools_ok_for_inlining_p (
11784 caller_opts->x_aarch64_fix_a53_err835769,
11785 callee_opts->x_aarch64_fix_a53_err835769,
11786 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11787 return false;
11788
11789 if (!aarch64_tribools_ok_for_inlining_p (
11790 caller_opts->x_aarch64_fix_a53_err843419,
11791 callee_opts->x_aarch64_fix_a53_err843419,
11792 2, TARGET_FIX_ERR_A53_843419))
11793 return false;
11794
11795 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11796 	     caller and callee and they don't match up, reject inlining. */
11797 if (!aarch64_tribools_ok_for_inlining_p (
11798 caller_opts->x_flag_omit_leaf_frame_pointer,
11799 callee_opts->x_flag_omit_leaf_frame_pointer,
11800 2, 1))
11801 return false;
11802
11803 /* If the callee has specific tuning overrides, respect them. */
11804 if (callee_opts->x_aarch64_override_tune_string != NULL
11805 && caller_opts->x_aarch64_override_tune_string == NULL)
11806 return false;
11807
11808 /* If the user specified tuning override strings for the
11809 caller and callee and they don't match up, reject inlining.
11810 We just do a string compare here, we don't analyze the meaning
11811 of the string, as it would be too costly for little gain. */
11812 if (callee_opts->x_aarch64_override_tune_string
11813 && caller_opts->x_aarch64_override_tune_string
11814 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11815 caller_opts->x_aarch64_override_tune_string) != 0))
11816 return false;
11817
11818 return true;
11819 }
11820
11821 /* Return true if SYMBOL_REF X binds locally. */
11822
11823 static bool
11824 aarch64_symbol_binds_local_p (const_rtx x)
11825 {
11826 return (SYMBOL_REF_DECL (x)
11827 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11828 : SYMBOL_REF_LOCAL_P (x));
11829 }
11830
11831 /* Return true if SYMBOL_REF X is thread-local. */
11832 static bool
11833 aarch64_tls_symbol_p (rtx x)
11834 {
11835 if (! TARGET_HAVE_TLS)
11836 return false;
11837
11838 if (GET_CODE (x) != SYMBOL_REF)
11839 return false;
11840
11841 return SYMBOL_REF_TLS_MODEL (x) != 0;
11842 }
11843
11844 /* Classify a TLS symbol into one of the TLS kinds. */
11845 enum aarch64_symbol_type
11846 aarch64_classify_tls_symbol (rtx x)
11847 {
11848 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11849
11850 switch (tls_kind)
11851 {
11852 case TLS_MODEL_GLOBAL_DYNAMIC:
11853 case TLS_MODEL_LOCAL_DYNAMIC:
11854 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11855
11856 case TLS_MODEL_INITIAL_EXEC:
11857 switch (aarch64_cmodel)
11858 {
11859 case AARCH64_CMODEL_TINY:
11860 case AARCH64_CMODEL_TINY_PIC:
11861 return SYMBOL_TINY_TLSIE;
11862 default:
11863 return SYMBOL_SMALL_TLSIE;
11864 }
11865
11866 case TLS_MODEL_LOCAL_EXEC:
11867 if (aarch64_tls_size == 12)
11868 return SYMBOL_TLSLE12;
11869 else if (aarch64_tls_size == 24)
11870 return SYMBOL_TLSLE24;
11871 else if (aarch64_tls_size == 32)
11872 return SYMBOL_TLSLE32;
11873 else if (aarch64_tls_size == 48)
11874 return SYMBOL_TLSLE48;
11875 else
11876 gcc_unreachable ();
11877
11878 case TLS_MODEL_EMULATED:
11879 case TLS_MODEL_NONE:
11880 return SYMBOL_FORCE_TO_MEM;
11881
11882 default:
11883 gcc_unreachable ();
11884 }
11885 }
11886
11887 /* Return the correct method for accessing X + OFFSET, where X is either
11888 a SYMBOL_REF or LABEL_REF. */
11889
11890 enum aarch64_symbol_type
11891 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11892 {
11893 if (GET_CODE (x) == LABEL_REF)
11894 {
11895 switch (aarch64_cmodel)
11896 {
11897 case AARCH64_CMODEL_LARGE:
11898 return SYMBOL_FORCE_TO_MEM;
11899
11900 case AARCH64_CMODEL_TINY_PIC:
11901 case AARCH64_CMODEL_TINY:
11902 return SYMBOL_TINY_ABSOLUTE;
11903
11904 case AARCH64_CMODEL_SMALL_SPIC:
11905 case AARCH64_CMODEL_SMALL_PIC:
11906 case AARCH64_CMODEL_SMALL:
11907 return SYMBOL_SMALL_ABSOLUTE;
11908
11909 default:
11910 gcc_unreachable ();
11911 }
11912 }
11913
11914 if (GET_CODE (x) == SYMBOL_REF)
11915 {
11916 if (aarch64_tls_symbol_p (x))
11917 return aarch64_classify_tls_symbol (x);
11918
11919 switch (aarch64_cmodel)
11920 {
11921 case AARCH64_CMODEL_TINY:
11922 /* When we retrieve symbol + offset address, we have to make sure
11923 the offset does not cause overflow of the final address. But
11924 we have no way of knowing the address of symbol at compile time
11925 so we can't accurately say if the distance between the PC and
11926 	 symbol + offset is outside the addressable range of +/-1M in the
11927 TINY code model. So we rely on images not being greater than
11928 1M and cap the offset at 1M and anything beyond 1M will have to
11929 be loaded using an alternative mechanism. Furthermore if the
11930 symbol is a weak reference to something that isn't known to
11931 resolve to a symbol in this module, then force to memory. */
11932 if ((SYMBOL_REF_WEAK (x)
11933 && !aarch64_symbol_binds_local_p (x))
11934 || !IN_RANGE (offset, -1048575, 1048575))
11935 return SYMBOL_FORCE_TO_MEM;
11936 return SYMBOL_TINY_ABSOLUTE;
11937
11938 case AARCH64_CMODEL_SMALL:
11939 /* Same reasoning as the tiny code model, but the offset cap here is
11940 4G. */
11941 if ((SYMBOL_REF_WEAK (x)
11942 && !aarch64_symbol_binds_local_p (x))
11943 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11944 HOST_WIDE_INT_C (4294967264)))
11945 return SYMBOL_FORCE_TO_MEM;
11946 return SYMBOL_SMALL_ABSOLUTE;
11947
11948 case AARCH64_CMODEL_TINY_PIC:
11949 if (!aarch64_symbol_binds_local_p (x))
11950 return SYMBOL_TINY_GOT;
11951 return SYMBOL_TINY_ABSOLUTE;
11952
11953 case AARCH64_CMODEL_SMALL_SPIC:
11954 case AARCH64_CMODEL_SMALL_PIC:
11955 if (!aarch64_symbol_binds_local_p (x))
11956 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11957 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11958 return SYMBOL_SMALL_ABSOLUTE;
11959
11960 case AARCH64_CMODEL_LARGE:
11961 /* This is alright even in PIC code as the constant
11962 pool reference is always PC relative and within
11963 the same translation unit. */
11964 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11965 return SYMBOL_SMALL_ABSOLUTE;
11966 else
11967 return SYMBOL_FORCE_TO_MEM;
11968
11969 default:
11970 gcc_unreachable ();
11971 }
11972 }
11973
11974 /* By default push everything into the constant pool. */
11975 return SYMBOL_FORCE_TO_MEM;
11976 }
11977
11978 bool
11979 aarch64_constant_address_p (rtx x)
11980 {
11981 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11982 }
11983
11984 bool
11985 aarch64_legitimate_pic_operand_p (rtx x)
11986 {
11987 if (GET_CODE (x) == SYMBOL_REF
11988 || (GET_CODE (x) == CONST
11989 && GET_CODE (XEXP (x, 0)) == PLUS
11990 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11991 return false;
11992
11993 return true;
11994 }
11995
11996 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11997 that should be rematerialized rather than spilled. */
11998
11999 static bool
12000 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
12001 {
12002 /* Support CSE and rematerialization of common constants. */
12003 if (CONST_INT_P (x)
12004 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12005 || GET_CODE (x) == CONST_VECTOR)
12006 return true;
12007
12008 /* Do not allow vector struct mode constants for Advanced SIMD.
12009 We could support 0 and -1 easily, but they need support in
12010 aarch64-simd.md. */
12011 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12012 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12013 return false;
12014
12015 /* Only accept variable-length vector constants if they can be
12016 handled directly.
12017
12018 ??? It would be possible to handle rematerialization of other
12019 constants via secondary reloads. */
12020 if (vec_flags & VEC_ANY_SVE)
12021 return aarch64_simd_valid_immediate (x, NULL);
12022
12023 if (GET_CODE (x) == HIGH)
12024 x = XEXP (x, 0);
12025
12026 /* Accept polynomial constants that can be calculated by using the
12027 destination of a move as the sole temporary. Constants that
12028 require a second temporary cannot be rematerialized (they can't be
12029 forced to memory and also aren't legitimate constants). */
12030 poly_int64 offset;
12031 if (poly_int_rtx_p (x, &offset))
12032 return aarch64_offset_temporaries (false, offset) <= 1;
12033
12034 /* If an offset is being added to something else, we need to allow the
12035 base to be moved into the destination register, meaning that there
12036 are no free temporaries for the offset. */
12037 x = strip_offset (x, &offset);
12038 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12039 return false;
12040
12041 /* Do not allow const (plus (anchor_symbol, const_int)). */
12042 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12043 return false;
12044
12045 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12046 so spilling them is better than rematerialization. */
12047 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12048 return true;
12049
12050 /* Label references are always constant. */
12051 if (GET_CODE (x) == LABEL_REF)
12052 return true;
12053
12054 return false;
12055 }
12056
12057 rtx
12058 aarch64_load_tp (rtx target)
12059 {
12060 if (!target
12061 || GET_MODE (target) != Pmode
12062 || !register_operand (target, Pmode))
12063 target = gen_reg_rtx (Pmode);
12064
12065 /* Can return in any reg. */
12066 emit_insn (gen_aarch64_load_tp_hard (target));
12067 return target;
12068 }
12069
12070 /* On AAPCS systems, this is the "struct __va_list". */
12071 static GTY(()) tree va_list_type;
12072
12073 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12074 Return the type to use as __builtin_va_list.
12075
12076 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12077
12078 struct __va_list
12079 {
12080 void *__stack;
12081 void *__gr_top;
12082 void *__vr_top;
12083 int __gr_offs;
12084 int __vr_offs;
12085 }; */
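/* In this layout __gr_offs and __vr_offs are initialized (in
   aarch64_expand_builtin_va_start below) to the negated sizes of the GR
   and VR save areas and climb towards zero as register arguments are
   consumed; a non-negative offset therefore means the next argument is
   taken from the stack via __stack.  */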
12086
12087 static tree
12088 aarch64_build_builtin_va_list (void)
12089 {
12090 tree va_list_name;
12091 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12092
12093 /* Create the type. */
12094 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12095 /* Give it the required name. */
12096 va_list_name = build_decl (BUILTINS_LOCATION,
12097 TYPE_DECL,
12098 get_identifier ("__va_list"),
12099 va_list_type);
12100 DECL_ARTIFICIAL (va_list_name) = 1;
12101 TYPE_NAME (va_list_type) = va_list_name;
12102 TYPE_STUB_DECL (va_list_type) = va_list_name;
12103
12104 /* Create the fields. */
12105 f_stack = build_decl (BUILTINS_LOCATION,
12106 FIELD_DECL, get_identifier ("__stack"),
12107 ptr_type_node);
12108 f_grtop = build_decl (BUILTINS_LOCATION,
12109 FIELD_DECL, get_identifier ("__gr_top"),
12110 ptr_type_node);
12111 f_vrtop = build_decl (BUILTINS_LOCATION,
12112 FIELD_DECL, get_identifier ("__vr_top"),
12113 ptr_type_node);
12114 f_groff = build_decl (BUILTINS_LOCATION,
12115 FIELD_DECL, get_identifier ("__gr_offs"),
12116 integer_type_node);
12117 f_vroff = build_decl (BUILTINS_LOCATION,
12118 FIELD_DECL, get_identifier ("__vr_offs"),
12119 integer_type_node);
12120
12121 /* Tell tree-stdarg pass about our internal offset fields.
12122 	 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12123 	 purposes, to identify whether the code is updating va_list internal
12124 	 offset fields in an irregular way. */
12125 va_list_gpr_counter_field = f_groff;
12126 va_list_fpr_counter_field = f_vroff;
12127
12128 DECL_ARTIFICIAL (f_stack) = 1;
12129 DECL_ARTIFICIAL (f_grtop) = 1;
12130 DECL_ARTIFICIAL (f_vrtop) = 1;
12131 DECL_ARTIFICIAL (f_groff) = 1;
12132 DECL_ARTIFICIAL (f_vroff) = 1;
12133
12134 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12135 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12136 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12137 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12138 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12139
12140 TYPE_FIELDS (va_list_type) = f_stack;
12141 DECL_CHAIN (f_stack) = f_grtop;
12142 DECL_CHAIN (f_grtop) = f_vrtop;
12143 DECL_CHAIN (f_vrtop) = f_groff;
12144 DECL_CHAIN (f_groff) = f_vroff;
12145
12146 /* Compute its layout. */
12147 layout_type (va_list_type);
12148
12149 return va_list_type;
12150 }
12151
12152 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12153 static void
12154 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12155 {
12156 const CUMULATIVE_ARGS *cum;
12157 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12158 tree stack, grtop, vrtop, groff, vroff;
12159 tree t;
12160 int gr_save_area_size = cfun->va_list_gpr_size;
12161 int vr_save_area_size = cfun->va_list_fpr_size;
12162 int vr_offset;
12163
12164 cum = &crtl->args.info;
12165 if (cfun->va_list_gpr_size)
12166 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12167 cfun->va_list_gpr_size);
12168 if (cfun->va_list_fpr_size)
12169 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12170 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12171
12172 if (!TARGET_FLOAT)
12173 {
12174 gcc_assert (cum->aapcs_nvrn == 0);
12175 vr_save_area_size = 0;
12176 }
12177
12178 f_stack = TYPE_FIELDS (va_list_type_node);
12179 f_grtop = DECL_CHAIN (f_stack);
12180 f_vrtop = DECL_CHAIN (f_grtop);
12181 f_groff = DECL_CHAIN (f_vrtop);
12182 f_vroff = DECL_CHAIN (f_groff);
12183
12184 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12185 NULL_TREE);
12186 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12187 NULL_TREE);
12188 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12189 NULL_TREE);
12190 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12191 NULL_TREE);
12192 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12193 NULL_TREE);
12194
12195 /* Emit code to initialize STACK, which points to the next varargs stack
12196 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12197 by named arguments. STACK is 8-byte aligned. */
12198 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12199 if (cum->aapcs_stack_size > 0)
12200 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12201 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12202 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12203
12204 /* Emit code to initialize GRTOP, the top of the GR save area.
12205 virtual_incoming_args_rtx should have been 16 byte aligned. */
12206 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12207 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12209
12210 /* Emit code to initialize VRTOP, the top of the VR save area.
12211 This address is gr_save_area_bytes below GRTOP, rounded
12212 down to the next 16-byte boundary. */
12213 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12214 vr_offset = ROUND_UP (gr_save_area_size,
12215 STACK_BOUNDARY / BITS_PER_UNIT);
12216
12217 if (vr_offset)
12218 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12219 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12220 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12221
12222 /* Emit code to initialize GROFF, the offset from GRTOP of the
12223 next GPR argument. */
12224 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12225 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12226 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12227
12228 	  /* Likewise emit code to initialize VROFF, the offset from VRTOP
12229 of the next VR argument. */
12230 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12231 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12232 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12233 }
12234
12235 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12236
12237 static tree
12238 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12239 gimple_seq *post_p ATTRIBUTE_UNUSED)
12240 {
12241 tree addr;
12242 bool indirect_p;
12243 bool is_ha; /* is HFA or HVA. */
12244 bool dw_align; /* double-word align. */
12245 machine_mode ag_mode = VOIDmode;
12246 int nregs;
12247 machine_mode mode;
12248
12249 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12250 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12251 HOST_WIDE_INT size, rsize, adjust, align;
12252 tree t, u, cond1, cond2;
12253
12254 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12255 if (indirect_p)
12256 type = build_pointer_type (type);
12257
12258 mode = TYPE_MODE (type);
12259
12260 f_stack = TYPE_FIELDS (va_list_type_node);
12261 f_grtop = DECL_CHAIN (f_stack);
12262 f_vrtop = DECL_CHAIN (f_grtop);
12263 f_groff = DECL_CHAIN (f_vrtop);
12264 f_vroff = DECL_CHAIN (f_groff);
12265
12266 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12267 f_stack, NULL_TREE);
12268 size = int_size_in_bytes (type);
12269 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12270
12271 dw_align = false;
12272 adjust = 0;
12273 if (aarch64_vfp_is_call_or_return_candidate (mode,
12274 type,
12275 &ag_mode,
12276 &nregs,
12277 &is_ha))
12278 {
12279 /* No frontends can create types with variable-sized modes, so we
12280 shouldn't be asked to pass or return them. */
12281 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12282
12283 /* TYPE passed in fp/simd registers. */
12284 if (!TARGET_FLOAT)
12285 aarch64_err_no_fpadvsimd (mode);
12286
12287 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12288 unshare_expr (valist), f_vrtop, NULL_TREE);
12289 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12290 unshare_expr (valist), f_vroff, NULL_TREE);
12291
12292 rsize = nregs * UNITS_PER_VREG;
12293
12294 if (is_ha)
12295 {
12296 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12297 adjust = UNITS_PER_VREG - ag_size;
12298 }
12299 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12300 && size < UNITS_PER_VREG)
12301 {
12302 adjust = UNITS_PER_VREG - size;
12303 }
12304 }
12305 else
12306 {
12307 /* TYPE passed in general registers. */
12308 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12309 unshare_expr (valist), f_grtop, NULL_TREE);
12310 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12311 unshare_expr (valist), f_groff, NULL_TREE);
12312 rsize = ROUND_UP (size, UNITS_PER_WORD);
12313 nregs = rsize / UNITS_PER_WORD;
12314
12315 if (align > 8)
12316 dw_align = true;
12317
12318 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12319 && size < UNITS_PER_WORD)
12320 {
12321 adjust = UNITS_PER_WORD - size;
12322 }
12323 }
12324
12325 /* Get a local temporary for the field value. */
12326 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12327
12328 /* Emit code to branch if off >= 0. */
12329 t = build2 (GE_EXPR, boolean_type_node, off,
12330 build_int_cst (TREE_TYPE (off), 0));
12331 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12332
12333 if (dw_align)
12334 {
12335 /* Emit: offs = (offs + 15) & -16. */
12336 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12337 build_int_cst (TREE_TYPE (off), 15));
12338 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12339 build_int_cst (TREE_TYPE (off), -16));
12340 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12341 }
12342 else
12343 roundup = NULL;
12344
12345 /* Update ap.__[g|v]r_offs */
12346 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12347 build_int_cst (TREE_TYPE (off), rsize));
12348 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12349
12350 /* String up. */
12351 if (roundup)
12352 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12353
12354 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12355 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12356 build_int_cst (TREE_TYPE (f_off), 0));
12357 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12358
12359 /* String up: make sure the assignment happens before the use. */
12360 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12361 COND_EXPR_ELSE (cond1) = t;
12362
12363 /* Prepare the trees handling the argument that is passed on the stack;
12364 	     the top-level node will be stored in ON_STACK. */
12365 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12366 if (align > 8)
12367 {
12368 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12369 t = fold_build_pointer_plus_hwi (arg, 15);
12370 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12371 build_int_cst (TREE_TYPE (t), -16));
12372 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12373 }
12374 else
12375 roundup = NULL;
12376 /* Advance ap.__stack */
12377 t = fold_build_pointer_plus_hwi (arg, size + 7);
12378 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12379 build_int_cst (TREE_TYPE (t), -8));
12380 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12381 /* String up roundup and advance. */
12382 if (roundup)
12383 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12384 /* String up with arg */
12385 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12386 /* Big-endianness related address adjustment. */
12387 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12388 && size < UNITS_PER_WORD)
12389 {
12390 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12391 size_int (UNITS_PER_WORD - size));
12392 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12393 }
12394
12395 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12396 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12397
12398 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12399 t = off;
12400 if (adjust)
12401 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12402 build_int_cst (TREE_TYPE (off), adjust));
12403
12404 t = fold_convert (sizetype, t);
12405 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12406
12407 if (is_ha)
12408 {
12409 /* type ha; // treat as "struct {ftype field[n];}"
12410 ... [computing offs]
12411 for (i = 0; i <nregs; ++i, offs += 16)
12412 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12413 return ha; */
12414 int i;
12415 tree tmp_ha, field_t, field_ptr_t;
12416
12417 /* Declare a local variable. */
12418 tmp_ha = create_tmp_var_raw (type, "ha");
12419 gimple_add_tmp_var (tmp_ha);
12420
12421 /* Establish the base type. */
12422 switch (ag_mode)
12423 {
12424 case E_SFmode:
12425 field_t = float_type_node;
12426 field_ptr_t = float_ptr_type_node;
12427 break;
12428 case E_DFmode:
12429 field_t = double_type_node;
12430 field_ptr_t = double_ptr_type_node;
12431 break;
12432 case E_TFmode:
12433 field_t = long_double_type_node;
12434 field_ptr_t = long_double_ptr_type_node;
12435 break;
12436 case E_HFmode:
12437 field_t = aarch64_fp16_type_node;
12438 field_ptr_t = aarch64_fp16_ptr_type_node;
12439 break;
12440 case E_V2SImode:
12441 case E_V4SImode:
12442 {
12443 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12444 field_t = build_vector_type_for_mode (innertype, ag_mode);
12445 field_ptr_t = build_pointer_type (field_t);
12446 }
12447 break;
12448 default:
12449 gcc_assert (0);
12450 }
12451
12452 	      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12453 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12454 addr = t;
12455 t = fold_convert (field_ptr_t, addr);
12456 t = build2 (MODIFY_EXPR, field_t,
12457 build1 (INDIRECT_REF, field_t, tmp_ha),
12458 build1 (INDIRECT_REF, field_t, t));
12459
12460 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12461 for (i = 1; i < nregs; ++i)
12462 {
12463 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12464 u = fold_convert (field_ptr_t, addr);
12465 u = build2 (MODIFY_EXPR, field_t,
12466 build2 (MEM_REF, field_t, tmp_ha,
12467 build_int_cst (field_ptr_t,
12468 (i *
12469 int_size_in_bytes (field_t)))),
12470 build1 (INDIRECT_REF, field_t, u));
12471 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12472 }
12473
12474 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12475 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12476 }
12477
12478 COND_EXPR_ELSE (cond2) = t;
12479 addr = fold_convert (build_pointer_type (type), cond1);
12480 addr = build_va_arg_indirect_ref (addr);
12481
12482 if (indirect_p)
12483 addr = build_va_arg_indirect_ref (addr);
12484
12485 return addr;
12486 }
12487
12488 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12489
12490 static void
12491 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12492 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12493 int no_rtl)
12494 {
12495 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12496 CUMULATIVE_ARGS local_cum;
12497 int gr_saved = cfun->va_list_gpr_size;
12498 int vr_saved = cfun->va_list_fpr_size;
12499
12500 /* The caller has advanced CUM up to, but not beyond, the last named
12501 argument. Advance a local copy of CUM past the last "real" named
12502 argument, to find out how many registers are left over. */
12503 local_cum = *cum;
12504 	  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12505
12506 	  /* Find out how many registers we need to save.
12507 	     Honor tree-stdarg analysis results. */
12508 if (cfun->va_list_gpr_size)
12509 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12510 cfun->va_list_gpr_size / UNITS_PER_WORD);
12511 if (cfun->va_list_fpr_size)
12512 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12513 cfun->va_list_fpr_size / UNITS_PER_VREG);
12514
12515 if (!TARGET_FLOAT)
12516 {
12517 gcc_assert (local_cum.aapcs_nvrn == 0);
12518 vr_saved = 0;
12519 }
12520
12521 if (!no_rtl)
12522 {
12523 if (gr_saved > 0)
12524 {
12525 rtx ptr, mem;
12526
12527 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12528 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12529 - gr_saved * UNITS_PER_WORD);
12530 mem = gen_frame_mem (BLKmode, ptr);
12531 set_mem_alias_set (mem, get_varargs_alias_set ());
12532
12533 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12534 mem, gr_saved);
12535 }
12536 if (vr_saved > 0)
12537 {
12538 /* We can't use move_block_from_reg, because it will use
12539 the wrong mode, storing D regs only. */
12540 machine_mode mode = TImode;
12541 int off, i, vr_start;
12542
12543 /* Set OFF to the offset from virtual_incoming_args_rtx of
12544 the first vector register. The VR save area lies below
12545 the GR one, and is aligned to 16 bytes. */
12546 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12547 STACK_BOUNDARY / BITS_PER_UNIT);
12548 off -= vr_saved * UNITS_PER_VREG;
12549
12550 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12551 for (i = 0; i < vr_saved; ++i)
12552 {
12553 rtx ptr, mem;
12554
12555 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12556 mem = gen_frame_mem (mode, ptr);
12557 set_mem_alias_set (mem, get_varargs_alias_set ());
12558 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12559 off += UNITS_PER_VREG;
12560 }
12561 }
12562 }
12563
12564 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12565 any complication of having crtl->args.pretend_args_size changed. */
12566 cfun->machine->frame.saved_varargs_size
12567 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12568 STACK_BOUNDARY / BITS_PER_UNIT)
12569 + vr_saved * UNITS_PER_VREG);
12570 }
12571
12572 static void
12573 aarch64_conditional_register_usage (void)
12574 {
12575 int i;
12576 if (!TARGET_FLOAT)
12577 {
12578 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12579 {
12580 fixed_regs[i] = 1;
12581 call_used_regs[i] = 1;
12582 }
12583 }
12584 if (!TARGET_SVE)
12585 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12586 {
12587 fixed_regs[i] = 1;
12588 call_used_regs[i] = 1;
12589 }
12590 }
12591
12592 /* Walk down the type tree of TYPE counting consecutive base elements.
12593 If *MODEP is VOIDmode, then set it to the first valid floating point
12594 type. If a non-floating point type is found, or if a floating point
12595 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12596 otherwise return the count in the sub-tree. */
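/* As an illustration (types invented for the example), for
     struct hfa { double x; double y; double z; };
   the walk finds three consecutive DFmode elements, sets *MODEP to DFmode
   and returns 3, whereas struct { double x; float y; } mixes base modes
   and yields -1.  */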
12597 static int
12598 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12599 {
12600 machine_mode mode;
12601 HOST_WIDE_INT size;
12602
12603 switch (TREE_CODE (type))
12604 {
12605 case REAL_TYPE:
12606 mode = TYPE_MODE (type);
12607 if (mode != DFmode && mode != SFmode
12608 && mode != TFmode && mode != HFmode)
12609 return -1;
12610
12611 if (*modep == VOIDmode)
12612 *modep = mode;
12613
12614 if (*modep == mode)
12615 return 1;
12616
12617 break;
12618
12619 case COMPLEX_TYPE:
12620 mode = TYPE_MODE (TREE_TYPE (type));
12621 if (mode != DFmode && mode != SFmode
12622 && mode != TFmode && mode != HFmode)
12623 return -1;
12624
12625 if (*modep == VOIDmode)
12626 *modep = mode;
12627
12628 if (*modep == mode)
12629 return 2;
12630
12631 break;
12632
12633 case VECTOR_TYPE:
12634 /* Use V2SImode and V4SImode as representatives of all 64-bit
12635 and 128-bit vector types. */
12636 size = int_size_in_bytes (type);
12637 switch (size)
12638 {
12639 case 8:
12640 mode = V2SImode;
12641 break;
12642 case 16:
12643 mode = V4SImode;
12644 break;
12645 default:
12646 return -1;
12647 }
12648
12649 if (*modep == VOIDmode)
12650 *modep = mode;
12651
12652 /* Vector modes are considered to be opaque: two vectors are
12653 equivalent for the purposes of being homogeneous aggregates
12654 if they are the same size. */
12655 if (*modep == mode)
12656 return 1;
12657
12658 break;
12659
12660 case ARRAY_TYPE:
12661 {
12662 int count;
12663 tree index = TYPE_DOMAIN (type);
12664
12665 /* Can't handle incomplete types nor sizes that are not
12666 fixed. */
12667 if (!COMPLETE_TYPE_P (type)
12668 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12669 return -1;
12670
12671 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12672 if (count == -1
12673 || !index
12674 || !TYPE_MAX_VALUE (index)
12675 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12676 || !TYPE_MIN_VALUE (index)
12677 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12678 || count < 0)
12679 return -1;
12680
12681 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12682 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12683
12684 /* There must be no padding. */
12685 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12686 count * GET_MODE_BITSIZE (*modep)))
12687 return -1;
12688
12689 return count;
12690 }
12691
12692 case RECORD_TYPE:
12693 {
12694 int count = 0;
12695 int sub_count;
12696 tree field;
12697
12698 /* Can't handle incomplete types nor sizes that are not
12699 fixed. */
12700 if (!COMPLETE_TYPE_P (type)
12701 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12702 return -1;
12703
12704 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12705 {
12706 if (TREE_CODE (field) != FIELD_DECL)
12707 continue;
12708
12709 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12710 if (sub_count < 0)
12711 return -1;
12712 count += sub_count;
12713 }
12714
12715 /* There must be no padding. */
12716 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12717 count * GET_MODE_BITSIZE (*modep)))
12718 return -1;
12719
12720 return count;
12721 }
12722
12723 case UNION_TYPE:
12724 case QUAL_UNION_TYPE:
12725 {
12726 /* These aren't very interesting except in a degenerate case. */
12727 int count = 0;
12728 int sub_count;
12729 tree field;
12730
12731 /* Can't handle incomplete types nor sizes that are not
12732 fixed. */
12733 if (!COMPLETE_TYPE_P (type)
12734 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12735 return -1;
12736
12737 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12738 {
12739 if (TREE_CODE (field) != FIELD_DECL)
12740 continue;
12741
12742 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12743 if (sub_count < 0)
12744 return -1;
12745 count = count > sub_count ? count : sub_count;
12746 }
12747
12748 /* There must be no padding. */
12749 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12750 count * GET_MODE_BITSIZE (*modep)))
12751 return -1;
12752
12753 return count;
12754 }
12755
12756 default:
12757 break;
12758 }
12759
12760 return -1;
12761 }
12762
12763 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12764 type as described in AAPCS64 \S 4.1.2.
12765
12766 See the comment above aarch64_composite_type_p for the notes on MODE. */
12767
12768 static bool
12769 aarch64_short_vector_p (const_tree type,
12770 machine_mode mode)
12771 {
12772 poly_int64 size = -1;
12773
12774 if (type && TREE_CODE (type) == VECTOR_TYPE)
12775 size = int_size_in_bytes (type);
12776 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12777 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12778 size = GET_MODE_SIZE (mode);
12779
12780 return known_eq (size, 8) || known_eq (size, 16);
12781 }
12782
12783 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12784 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12785 array types. The C99 floating-point complex types are also considered
12786 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12787 types, which are GCC extensions and out of the scope of AAPCS64, are
12788 treated as composite types here as well.
12789
12790 Note that MODE itself is not sufficient in determining whether a type
12791 is such a composite type or not. This is because
12792 stor-layout.c:compute_record_mode may have already changed the MODE
12793 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12794 structure with only one field may have its MODE set to the mode of the
12795 field. Also an integer mode whose size matches the size of the
12796 RECORD_TYPE type may be used to substitute the original mode
12797 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12798 solely relied on. */
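/* Illustrative example (assumption, not from the original source):
     struct wrapper { float f; };
   may end up with TYPE_MODE == SFmode after compute_record_mode, yet it
   is still a composite type under AAPCS64, which is why the function
   below checks TYPE before falling back on MODE.  */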
12799
12800 static bool
12801 aarch64_composite_type_p (const_tree type,
12802 machine_mode mode)
12803 {
12804 if (aarch64_short_vector_p (type, mode))
12805 return false;
12806
12807 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12808 return true;
12809
12810 if (mode == BLKmode
12811 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12812 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12813 return true;
12814
12815 return false;
12816 }
12817
12818 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12819 shall be passed or returned in simd/fp register(s) (providing these
12820 parameter passing registers are available).
12821
12822 Upon successful return, *COUNT returns the number of needed registers,
12823 *BASE_MODE returns the mode of the individual register and when IS_HA
12824 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12825 floating-point aggregate or a homogeneous short-vector aggregate. */
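/* Illustrative examples (assumed, not from the original source):
     double                   -> *count == 1, *base_mode == DFmode
     _Complex float           -> *count == 2, *base_mode == SFmode, HA
     struct { float v[4]; }   -> *count == 4, *base_mode == SFmode, HA
   A struct of five floats exceeds HA_MAX_NUM_FLDS and is rejected.  */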
12826
12827 static bool
12828 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12829 const_tree type,
12830 machine_mode *base_mode,
12831 int *count,
12832 bool *is_ha)
12833 {
12834 machine_mode new_mode = VOIDmode;
12835 bool composite_p = aarch64_composite_type_p (type, mode);
12836
12837 if (is_ha != NULL) *is_ha = false;
12838
12839 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12840 || aarch64_short_vector_p (type, mode))
12841 {
12842 *count = 1;
12843 new_mode = mode;
12844 }
12845 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12846 {
12847 if (is_ha != NULL) *is_ha = true;
12848 *count = 2;
12849 new_mode = GET_MODE_INNER (mode);
12850 }
12851 else if (type && composite_p)
12852 {
12853 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12854
12855 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12856 {
12857 if (is_ha != NULL) *is_ha = true;
12858 *count = ag_count;
12859 }
12860 else
12861 return false;
12862 }
12863 else
12864 return false;
12865
12866 *base_mode = new_mode;
12867 return true;
12868 }
12869
12870 /* Implement TARGET_STRUCT_VALUE_RTX. */
12871
12872 static rtx
12873 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12874 int incoming ATTRIBUTE_UNUSED)
12875 {
12876 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12877 }
12878
12879 /* Implement target hook TARGET_VECTOR_MODE_SUPPORTED_P. */
12880 static bool
12881 aarch64_vector_mode_supported_p (machine_mode mode)
12882 {
12883 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12884 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12885 }
12886
12887 /* Return appropriate SIMD container
12888 for MODE within a vector of WIDTH bits. */
12889 static machine_mode
12890 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12891 {
12892 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12893 switch (mode)
12894 {
12895 case E_DFmode:
12896 return VNx2DFmode;
12897 case E_SFmode:
12898 return VNx4SFmode;
12899 case E_HFmode:
12900 return VNx8HFmode;
12901 case E_DImode:
12902 return VNx2DImode;
12903 case E_SImode:
12904 return VNx4SImode;
12905 case E_HImode:
12906 return VNx8HImode;
12907 case E_QImode:
12908 return VNx16QImode;
12909 default:
12910 return word_mode;
12911 }
12912
12913 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12914 if (TARGET_SIMD)
12915 {
12916 if (known_eq (width, 128))
12917 switch (mode)
12918 {
12919 case E_DFmode:
12920 return V2DFmode;
12921 case E_SFmode:
12922 return V4SFmode;
12923 case E_HFmode:
12924 return V8HFmode;
12925 case E_SImode:
12926 return V4SImode;
12927 case E_HImode:
12928 return V8HImode;
12929 case E_QImode:
12930 return V16QImode;
12931 case E_DImode:
12932 return V2DImode;
12933 default:
12934 break;
12935 }
12936 else
12937 switch (mode)
12938 {
12939 case E_SFmode:
12940 return V2SFmode;
12941 case E_HFmode:
12942 return V4HFmode;
12943 case E_SImode:
12944 return V2SImode;
12945 case E_HImode:
12946 return V4HImode;
12947 case E_QImode:
12948 return V8QImode;
12949 default:
12950 break;
12951 }
12952 }
12953 return word_mode;
12954 }
12955
12956 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12957 static machine_mode
12958 aarch64_preferred_simd_mode (scalar_mode mode)
12959 {
12960 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12961 return aarch64_simd_container_mode (mode, bits);
12962 }
12963
12964 /* Return a list of possible vector sizes for the vectorizer
12965 to iterate over. */
12966 static void
12967 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12968 {
12969 if (TARGET_SVE)
12970 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12971 sizes->safe_push (16);
12972 sizes->safe_push (8);
12973 }
12974
12975 /* Implement TARGET_MANGLE_TYPE. */
12976
12977 static const char *
12978 aarch64_mangle_type (const_tree type)
12979 {
12980 /* The AArch64 ABI documents say that "__va_list" has to be
12981 mangled as if it is in the "std" namespace. */
12982 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12983 return "St9__va_list";
12984
12985 /* Half-precision float. */
12986 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12987 return "Dh";
12988
12989 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12990 builtin types. */
12991 if (TYPE_NAME (type) != NULL)
12992 return aarch64_mangle_builtin_type (type);
12993
12994 /* Use the default mangling. */
12995 return NULL;
12996 }
12997
12998 /* Find the first rtx_insn before insn that will generate an assembly
12999 instruction. */
13000
13001 static rtx_insn *
13002 aarch64_prev_real_insn (rtx_insn *insn)
13003 {
13004 if (!insn)
13005 return NULL;
13006
13007 do
13008 {
13009 insn = prev_real_insn (insn);
13010 }
13011 while (insn && recog_memoized (insn) < 0);
13012
13013 return insn;
13014 }
13015
13016 static bool
13017 is_madd_op (enum attr_type t1)
13018 {
13019 unsigned int i;
13020 /* A number of these may be AArch32 only. */
13021 enum attr_type mlatypes[] = {
13022 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13023 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13024 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13025 };
13026
13027 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13028 {
13029 if (t1 == mlatypes[i])
13030 return true;
13031 }
13032
13033 return false;
13034 }
13035
13036 /* Check if there is a register dependency between a load and the insn
13037 for which we hold recog_data. */
13038
13039 static bool
13040 dep_between_memop_and_curr (rtx memop)
13041 {
13042 rtx load_reg;
13043 int opno;
13044
13045 gcc_assert (GET_CODE (memop) == SET);
13046
13047 if (!REG_P (SET_DEST (memop)))
13048 return false;
13049
13050 load_reg = SET_DEST (memop);
13051 for (opno = 1; opno < recog_data.n_operands; opno++)
13052 {
13053 rtx operand = recog_data.operand[opno];
13054 if (REG_P (operand)
13055 && reg_overlap_mentioned_p (load_reg, operand))
13056 return true;
13057
13058 }
13059 return false;
13060 }
13061
13062
13063 /* When working around the Cortex-A53 erratum 835769,
13064 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13065 instruction and has a preceding memory instruction such that a NOP
13066 should be inserted between them. */
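/* Illustrative (assumed) assembly for the effect of the workaround; see
   aarch64_final_prescan_insn below:

     ldr   x1, [x2]            // preceding 64-bit memory operation
     nop                       // between mem op and mult-accumulate
     madd  x0, x3, x4, x5      // 64-bit multiply-accumulate

   The NOP keeps the two instructions from being issued back to back on
   affected Cortex-A53 parts.  */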
13067
13068 bool
13069 aarch64_madd_needs_nop (rtx_insn* insn)
13070 {
13071 enum attr_type attr_type;
13072 rtx_insn *prev;
13073 rtx body;
13074
13075 if (!TARGET_FIX_ERR_A53_835769)
13076 return false;
13077
13078 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13079 return false;
13080
13081 attr_type = get_attr_type (insn);
13082 if (!is_madd_op (attr_type))
13083 return false;
13084
13085 prev = aarch64_prev_real_insn (insn);
13086 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13087 Restore recog state to INSN to avoid state corruption. */
13088 extract_constrain_insn_cached (insn);
13089
13090 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13091 return false;
13092
13093 body = single_set (prev);
13094
13095 /* If the previous insn is a memory op and there is no dependency between
13096 it and the DImode madd, emit a NOP between them. If body is NULL then we
13097 have a complex memory operation, probably a load/store pair.
13098 Be conservative for now and emit a NOP. */
13099 if (GET_MODE (recog_data.operand[0]) == DImode
13100 && (!body || !dep_between_memop_and_curr (body)))
13101 return true;
13102
13103 return false;
13104
13105 }
13106
13107
13108 /* Implement FINAL_PRESCAN_INSN. */
13109
13110 void
13111 aarch64_final_prescan_insn (rtx_insn *insn)
13112 {
13113 if (aarch64_madd_needs_nop (insn))
13114 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13115 }
13116
13117
13118 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13119 instruction. */
13120
13121 bool
13122 aarch64_sve_index_immediate_p (rtx base_or_step)
13123 {
13124 return (CONST_INT_P (base_or_step)
13125 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13126 }
13127
13128 /* Return true if X is a valid immediate for the SVE ADD and SUB
13129 instructions. Negate X first if NEGATE_P is true. */
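/* For illustration (assumption): a vector duplicating 0x23 or 0x2300 is
   accepted, since the immediate is an 8-bit value optionally shifted
   left by 8, but a duplicated 0x123 is rejected because it needs
   nonzero bits in both bytes.  */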
13130
13131 bool
13132 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13133 {
13134 rtx elt;
13135
13136 if (!const_vec_duplicate_p (x, &elt)
13137 || !CONST_INT_P (elt))
13138 return false;
13139
13140 HOST_WIDE_INT val = INTVAL (elt);
13141 if (negate_p)
13142 val = -val;
13143 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13144
13145 if (val & 0xff)
13146 return IN_RANGE (val, 0, 0xff);
13147 return IN_RANGE (val, 0, 0xff00);
13148 }
13149
13150 /* Return true if X is a valid immediate operand for an SVE logical
13151 instruction such as AND. */
13152
13153 bool
13154 aarch64_sve_bitmask_immediate_p (rtx x)
13155 {
13156 rtx elt;
13157
13158 return (const_vec_duplicate_p (x, &elt)
13159 && CONST_INT_P (elt)
13160 && aarch64_bitmask_imm (INTVAL (elt),
13161 GET_MODE_INNER (GET_MODE (x))));
13162 }
13163
13164 /* Return true if X is a valid immediate for the SVE DUP and CPY
13165 instructions. */
13166
13167 bool
13168 aarch64_sve_dup_immediate_p (rtx x)
13169 {
13170 rtx elt;
13171
13172 if (!const_vec_duplicate_p (x, &elt)
13173 || !CONST_INT_P (elt))
13174 return false;
13175
13176 HOST_WIDE_INT val = INTVAL (elt);
13177 if (val & 0xff)
13178 return IN_RANGE (val, -0x80, 0x7f);
13179 return IN_RANGE (val, -0x8000, 0x7f00);
13180 }
13181
13182 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13183 SIGNED_P says whether the operand is signed rather than unsigned. */
13184
13185 bool
13186 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13187 {
13188 rtx elt;
13189
13190 return (const_vec_duplicate_p (x, &elt)
13191 && CONST_INT_P (elt)
13192 && (signed_p
13193 ? IN_RANGE (INTVAL (elt), -16, 15)
13194 : IN_RANGE (INTVAL (elt), 0, 127)));
13195 }
13196
13197 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13198 instruction. Negate X first if NEGATE_P is true. */
13199
13200 bool
13201 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13202 {
13203 rtx elt;
13204 REAL_VALUE_TYPE r;
13205
13206 if (!const_vec_duplicate_p (x, &elt)
13207 || GET_CODE (elt) != CONST_DOUBLE)
13208 return false;
13209
13210 r = *CONST_DOUBLE_REAL_VALUE (elt);
13211
13212 if (negate_p)
13213 r = real_value_negate (&r);
13214
13215 if (real_equal (&r, &dconst1))
13216 return true;
13217 if (real_equal (&r, &dconsthalf))
13218 return true;
13219 return false;
13220 }
13221
13222 /* Return true if X is a valid immediate operand for an SVE FMUL
13223 instruction. */
13224
13225 bool
13226 aarch64_sve_float_mul_immediate_p (rtx x)
13227 {
13228 rtx elt;
13229
13230 /* GCC will never generate a multiply with an immediate of 2, so there is no
13231 point testing for it (even though it is a valid constant). */
13232 return (const_vec_duplicate_p (x, &elt)
13233 && GET_CODE (elt) == CONST_DOUBLE
13234 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13235 }
13236
13237 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13238 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13239 is nonnull, use it to describe valid immediates. */
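/* Illustrative examples (assumed): 0x0000ab00 is a 4-byte immediate with
   LSL #8; 0xab00ab00 replicates a 2-byte immediate (0xab with LSL #8);
   0x0012ffff needs MSL #16, shifting in ones, and so is only accepted
   for AARCH64_CHECK_MOV.  */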
13240 static bool
13241 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13242 simd_immediate_info *info,
13243 enum simd_immediate_check which,
13244 simd_immediate_info::insn_type insn)
13245 {
13246 /* Try a 4-byte immediate with LSL. */
13247 for (unsigned int shift = 0; shift < 32; shift += 8)
13248 if ((val32 & (0xff << shift)) == val32)
13249 {
13250 if (info)
13251 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13252 simd_immediate_info::LSL, shift);
13253 return true;
13254 }
13255
13256 /* Try a 2-byte immediate with LSL. */
13257 unsigned int imm16 = val32 & 0xffff;
13258 if (imm16 == (val32 >> 16))
13259 for (unsigned int shift = 0; shift < 16; shift += 8)
13260 if ((imm16 & (0xff << shift)) == imm16)
13261 {
13262 if (info)
13263 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13264 simd_immediate_info::LSL, shift);
13265 return true;
13266 }
13267
13268 /* Try a 4-byte immediate with MSL, except for cases that MVN
13269 can handle. */
13270 if (which == AARCH64_CHECK_MOV)
13271 for (unsigned int shift = 8; shift < 24; shift += 8)
13272 {
13273 unsigned int low = (1 << shift) - 1;
13274 if (((val32 & (0xff << shift)) | low) == val32)
13275 {
13276 if (info)
13277 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13278 simd_immediate_info::MSL, shift);
13279 return true;
13280 }
13281 }
13282
13283 return false;
13284 }
13285
13286 /* Return true if replicating VAL64 is a valid immediate for the
13287 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13288 use it to describe valid immediates. */
13289 static bool
13290 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13291 simd_immediate_info *info,
13292 enum simd_immediate_check which)
13293 {
13294 unsigned int val32 = val64 & 0xffffffff;
13295 unsigned int val16 = val64 & 0xffff;
13296 unsigned int val8 = val64 & 0xff;
13297
13298 if (val32 == (val64 >> 32))
13299 {
13300 if ((which & AARCH64_CHECK_ORR) != 0
13301 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13302 simd_immediate_info::MOV))
13303 return true;
13304
13305 if ((which & AARCH64_CHECK_BIC) != 0
13306 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13307 simd_immediate_info::MVN))
13308 return true;
13309
13310 /* Try using a replicated byte. */
13311 if (which == AARCH64_CHECK_MOV
13312 && val16 == (val32 >> 16)
13313 && val8 == (val16 >> 8))
13314 {
13315 if (info)
13316 *info = simd_immediate_info (QImode, val8);
13317 return true;
13318 }
13319 }
13320
13321 /* Try using a bit-to-bytemask. */
13322 if (which == AARCH64_CHECK_MOV)
13323 {
13324 unsigned int i;
13325 for (i = 0; i < 64; i += 8)
13326 {
13327 unsigned char byte = (val64 >> i) & 0xff;
13328 if (byte != 0 && byte != 0xff)
13329 break;
13330 }
13331 if (i == 64)
13332 {
13333 if (info)
13334 *info = simd_immediate_info (DImode, val64);
13335 return true;
13336 }
13337 }
13338 return false;
13339 }
13340
13341 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13342 instruction. If INFO is nonnull, use it to describe valid immediates. */
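/* Illustrative examples (assumed): 0x2929292929292929 collapses to a
   QImode DUP of 0x29; 0x1200120012001200 collapses to HImode and is a
   DUP of 0x12 with LSL #8; 0x00ff00ff00ff00ff is neither, but is a
   valid HImode bitmask immediate and so matches DUPM.  */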
13343
13344 static bool
13345 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13346 simd_immediate_info *info)
13347 {
13348 scalar_int_mode mode = DImode;
13349 unsigned int val32 = val64 & 0xffffffff;
13350 if (val32 == (val64 >> 32))
13351 {
13352 mode = SImode;
13353 unsigned int val16 = val32 & 0xffff;
13354 if (val16 == (val32 >> 16))
13355 {
13356 mode = HImode;
13357 unsigned int val8 = val16 & 0xff;
13358 if (val8 == (val16 >> 8))
13359 mode = QImode;
13360 }
13361 }
13362 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13363 if (IN_RANGE (val, -0x80, 0x7f))
13364 {
13365 /* DUP with no shift. */
13366 if (info)
13367 *info = simd_immediate_info (mode, val);
13368 return true;
13369 }
13370 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13371 {
13372 /* DUP with LSL #8. */
13373 if (info)
13374 *info = simd_immediate_info (mode, val);
13375 return true;
13376 }
13377 if (aarch64_bitmask_imm (val64, mode))
13378 {
13379 /* DUPM. */
13380 if (info)
13381 *info = simd_immediate_info (mode, val);
13382 return true;
13383 }
13384 return false;
13385 }
13386
13387 /* Return true if OP is a valid SIMD immediate for the operation
13388 described by WHICH. If INFO is nonnull, use it to describe valid
13389 immediates. */
13390 bool
13391 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13392 enum simd_immediate_check which)
13393 {
13394 machine_mode mode = GET_MODE (op);
13395 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13396 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13397 return false;
13398
13399 scalar_mode elt_mode = GET_MODE_INNER (mode);
13400 rtx base, step;
13401 unsigned int n_elts;
13402 if (GET_CODE (op) == CONST_VECTOR
13403 && CONST_VECTOR_DUPLICATE_P (op))
13404 n_elts = CONST_VECTOR_NPATTERNS (op);
13405 else if ((vec_flags & VEC_SVE_DATA)
13406 && const_vec_series_p (op, &base, &step))
13407 {
13408 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13409 if (!aarch64_sve_index_immediate_p (base)
13410 || !aarch64_sve_index_immediate_p (step))
13411 return false;
13412
13413 if (info)
13414 *info = simd_immediate_info (elt_mode, base, step);
13415 return true;
13416 }
13417 else if (GET_CODE (op) == CONST_VECTOR
13418 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13419 /* N_ELTS set above. */;
13420 else
13421 return false;
13422
13423 /* Handle PFALSE and PTRUE. */
13424 if (vec_flags & VEC_SVE_PRED)
13425 return (op == CONST0_RTX (mode)
13426 || op == CONSTM1_RTX (mode));
13427
13428 scalar_float_mode elt_float_mode;
13429 if (n_elts == 1
13430 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13431 {
13432 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13433 if (aarch64_float_const_zero_rtx_p (elt)
13434 || aarch64_float_const_representable_p (elt))
13435 {
13436 if (info)
13437 *info = simd_immediate_info (elt_float_mode, elt);
13438 return true;
13439 }
13440 }
13441
13442 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13443 if (elt_size > 8)
13444 return false;
13445
13446 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13447
13448 /* Expand the vector constant out into a byte vector, with the least
13449 significant byte of the register first. */
13450 auto_vec<unsigned char, 16> bytes;
13451 bytes.reserve (n_elts * elt_size);
13452 for (unsigned int i = 0; i < n_elts; i++)
13453 {
13454 /* The vector is provided in gcc endian-neutral fashion.
13455 For aarch64_be Advanced SIMD, it must be laid out in the vector
13456 register in reverse order. */
13457 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13458 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13459
13460 if (elt_mode != elt_int_mode)
13461 elt = gen_lowpart (elt_int_mode, elt);
13462
13463 if (!CONST_INT_P (elt))
13464 return false;
13465
13466 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13467 for (unsigned int byte = 0; byte < elt_size; byte++)
13468 {
13469 bytes.quick_push (elt_val & 0xff);
13470 elt_val >>= BITS_PER_UNIT;
13471 }
13472 }
13473
13474 /* The immediate must repeat every eight bytes. */
13475 unsigned int nbytes = bytes.length ();
13476 for (unsigned i = 8; i < nbytes; ++i)
13477 if (bytes[i] != bytes[i - 8])
13478 return false;
13479
13480 /* Get the repeating 8-byte value as an integer. No endian correction
13481 is needed here because bytes is already in lsb-first order. */
13482 unsigned HOST_WIDE_INT val64 = 0;
13483 for (unsigned int i = 0; i < 8; i++)
13484 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13485 << (i * BITS_PER_UNIT));
13486
13487 if (vec_flags & VEC_SVE_DATA)
13488 return aarch64_sve_valid_immediate (val64, info);
13489 else
13490 return aarch64_advsimd_valid_immediate (val64, info, which);
13491 }
13492
13493 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13494 has a step in the range of an SVE INDEX immediate. Return the index expression if so,
13495 otherwise return null. */
13496 rtx
13497 aarch64_check_zero_based_sve_index_immediate (rtx x)
13498 {
13499 rtx base, step;
13500 if (const_vec_series_p (x, &base, &step)
13501 && base == const0_rtx
13502 && aarch64_sve_index_immediate_p (step))
13503 return step;
13504 return NULL_RTX;
13505 }
13506
13507 /* Check if immediate shift constants are within range. */
13508 bool
13509 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13510 {
13511 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13512 if (left)
13513 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13514 else
13515 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13516 }
13517
13518 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13519 operation of width WIDTH at bit position POS. */
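/* For example (illustrative), WIDTH == 8 and POS == 16 give the mask
   0xff0000.  */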
13520
13521 rtx
13522 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13523 {
13524 gcc_assert (CONST_INT_P (width));
13525 gcc_assert (CONST_INT_P (pos));
13526
13527 unsigned HOST_WIDE_INT mask
13528 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13529 return GEN_INT (mask << UINTVAL (pos));
13530 }
13531
13532 bool
13533 aarch64_mov_operand_p (rtx x, machine_mode mode)
13534 {
13535 if (GET_CODE (x) == HIGH
13536 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13537 return true;
13538
13539 if (CONST_INT_P (x))
13540 return true;
13541
13542 if (VECTOR_MODE_P (GET_MODE (x)))
13543 return aarch64_simd_valid_immediate (x, NULL);
13544
13545 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13546 return true;
13547
13548 if (aarch64_sve_cnt_immediate_p (x))
13549 return true;
13550
13551 return aarch64_classify_symbolic_expression (x)
13552 == SYMBOL_TINY_ABSOLUTE;
13553 }
13554
13555 /* Return a const_int vector of VAL. */
13556 rtx
13557 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13558 {
13559 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13560 return gen_const_vec_duplicate (mode, c);
13561 }
13562
13563 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13564
13565 bool
13566 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13567 {
13568 machine_mode vmode;
13569
13570 vmode = aarch64_simd_container_mode (mode, 64);
13571 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13572 return aarch64_simd_valid_immediate (op_v, NULL);
13573 }
13574
13575 /* Construct and return a PARALLEL RTX vector with elements numbering the
13576 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13577 the vector - from the perspective of the architecture. This does not
13578 line up with GCC's perspective on lane numbers, so we end up with
13579 different masks depending on our target endian-ness. The diagram
13580 below may help. We must draw the distinction when building masks
13581 which select one half of the vector. An instruction selecting
13582 architectural low-lanes for a big-endian target must be described using
13583 a mask selecting GCC high-lanes.
13584
13585 Big-Endian Little-Endian
13586
13587 GCC 0 1 2 3 3 2 1 0
13588 | x | x | x | x | | x | x | x | x |
13589 Architecture 3 2 1 0 3 2 1 0
13590
13591 Low Mask: { 2, 3 } { 0, 1 }
13592 High Mask: { 0, 1 } { 2, 3 }
13593
13594 MODE is the mode of the vector and NUNITS is the number of units in it. */
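/* For illustration (assumed): for V4SImode (NUNITS == 4) on a
   little-endian target, HIGH == true gives (parallel [2 3]) and
   HIGH == false gives (parallel [0 1]); on big-endian the two masks are
   swapped, as in the table above.  */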
13595
13596 rtx
13597 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13598 {
13599 rtvec v = rtvec_alloc (nunits / 2);
13600 int high_base = nunits / 2;
13601 int low_base = 0;
13602 int base;
13603 rtx t1;
13604 int i;
13605
13606 if (BYTES_BIG_ENDIAN)
13607 base = high ? low_base : high_base;
13608 else
13609 base = high ? high_base : low_base;
13610
13611 for (i = 0; i < nunits / 2; i++)
13612 RTVEC_ELT (v, i) = GEN_INT (base + i);
13613
13614 t1 = gen_rtx_PARALLEL (mode, v);
13615 return t1;
13616 }
13617
13618 /* Check OP for validity as a PARALLEL RTX vector with elements
13619 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13620 from the perspective of the architecture. See the diagram above
13621 aarch64_simd_vect_par_cnst_half for more details. */
13622
13623 bool
13624 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13625 bool high)
13626 {
13627 int nelts;
13628 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13629 return false;
13630
13631 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13632 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13633 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13634 int i = 0;
13635
13636 if (count_op != count_ideal)
13637 return false;
13638
13639 for (i = 0; i < count_ideal; i++)
13640 {
13641 rtx elt_op = XVECEXP (op, 0, i);
13642 rtx elt_ideal = XVECEXP (ideal, 0, i);
13643
13644 if (!CONST_INT_P (elt_op)
13645 || INTVAL (elt_ideal) != INTVAL (elt_op))
13646 return false;
13647 }
13648 return true;
13649 }
13650
13651 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13652 HIGH (exclusive). */
13653 void
13654 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13655 const_tree exp)
13656 {
13657 HOST_WIDE_INT lane;
13658 gcc_assert (CONST_INT_P (operand));
13659 lane = INTVAL (operand);
13660
13661 if (lane < low || lane >= high)
13662 {
13663 if (exp)
13664 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13665 else
13666 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13667 }
13668 }
13669
13670 /* Perform endian correction on lane number N, which indexes a vector
13671 of mode MODE, and return the result as an SImode rtx. */
13672
13673 rtx
13674 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13675 {
13676 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13677 }
13678
13679 /* Return TRUE if OP is a valid vector addressing mode. */
13680
13681 bool
13682 aarch64_simd_mem_operand_p (rtx op)
13683 {
13684 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13685 || REG_P (XEXP (op, 0)));
13686 }
13687
13688 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13689
13690 bool
13691 aarch64_sve_ld1r_operand_p (rtx op)
13692 {
13693 struct aarch64_address_info addr;
13694 scalar_mode mode;
13695
13696 return (MEM_P (op)
13697 && is_a <scalar_mode> (GET_MODE (op), &mode)
13698 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13699 && addr.type == ADDRESS_REG_IMM
13700 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13701 }
13702
13703 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13704 The conditions for STR are the same. */
13705 bool
13706 aarch64_sve_ldr_operand_p (rtx op)
13707 {
13708 struct aarch64_address_info addr;
13709
13710 return (MEM_P (op)
13711 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13712 false, ADDR_QUERY_ANY)
13713 && addr.type == ADDRESS_REG_IMM);
13714 }
13715
13716 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13717 We need to be able to access the individual pieces, so the range
13718 is different from LD[234] and ST[234]. */
13719 bool
13720 aarch64_sve_struct_memory_operand_p (rtx op)
13721 {
13722 if (!MEM_P (op))
13723 return false;
13724
13725 machine_mode mode = GET_MODE (op);
13726 struct aarch64_address_info addr;
13727 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13728 ADDR_QUERY_ANY)
13729 || addr.type != ADDRESS_REG_IMM)
13730 return false;
13731
13732 poly_int64 first = addr.const_offset;
13733 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13734 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13735 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13736 }
13737
13738 /* Emit a register copy from operand to operand, taking care not to
13739 early-clobber source registers in the process.
13740
13741 COUNT is the number of components into which the copy needs to be
13742 decomposed. */
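/* For example (illustrative), copying an OImode pair from V1-V2 to
   V2-V3 must move V2 into V3 before V1 into V2, which is why the loop
   below runs in reverse when the destination overlaps the source and
   has the higher register number.  */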
13743 void
13744 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13745 unsigned int count)
13746 {
13747 unsigned int i;
13748 int rdest = REGNO (operands[0]);
13749 int rsrc = REGNO (operands[1]);
13750
13751 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13752 || rdest < rsrc)
13753 for (i = 0; i < count; i++)
13754 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13755 gen_rtx_REG (mode, rsrc + i));
13756 else
13757 for (i = 0; i < count; i++)
13758 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13759 gen_rtx_REG (mode, rsrc + count - i - 1));
13760 }
13761
13762 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13763 one of the VSTRUCT modes: OI, CI, or XI. */
13764 int
13765 aarch64_simd_attr_length_rglist (machine_mode mode)
13766 {
13767 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13768 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13769 }
13770
13771 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13772 alignment of a vector to 128 bits. SVE predicates have an alignment of
13773 16 bits. */
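/* For illustration (assumed): a fixed-length V8QI vector gets 64-bit
   alignment, a 256-bit GNU vector is capped at 128 bits, a
   variable-length SVE data vector gets 128 bits and an SVE predicate
   gets 16 bits.  */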
13774 static HOST_WIDE_INT
13775 aarch64_simd_vector_alignment (const_tree type)
13776 {
13777 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13778 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13779 be set for non-predicate vectors of booleans. Modes are the most
13780 direct way we have of identifying real SVE predicate types. */
13781 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13782 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13783 return MIN (align, 128);
13784 }
13785
13786 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13787 static HOST_WIDE_INT
13788 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13789 {
13790 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13791 {
13792 /* If the length of the vector is fixed, try to align to that length,
13793 otherwise don't try to align at all. */
13794 HOST_WIDE_INT result;
13795 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13796 result = TYPE_ALIGN (TREE_TYPE (type));
13797 return result;
13798 }
13799 return TYPE_ALIGN (type);
13800 }
13801
13802 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13803 static bool
13804 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13805 {
13806 if (is_packed)
13807 return false;
13808
13809 /* For fixed-length vectors, check that the vectorizer will aim for
13810 full-vector alignment. This isn't true for generic GCC vectors
13811 that are wider than the ABI maximum of 128 bits. */
13812 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13813 && (wi::to_widest (TYPE_SIZE (type))
13814 != aarch64_vectorize_preferred_vector_alignment (type)))
13815 return false;
13816
13817 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13818 return true;
13819 }
13820
13821 /* Return true if the vector misalignment factor is supported by the
13822 target. */
13823 static bool
13824 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13825 const_tree type, int misalignment,
13826 bool is_packed)
13827 {
13828 if (TARGET_SIMD && STRICT_ALIGNMENT)
13829 {
13830 /* Return if movmisalign pattern is not supported for this mode. */
13831 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13832 return false;
13833
13834 /* Misalignment factor is unknown at compile time. */
13835 if (misalignment == -1)
13836 return false;
13837 }
13838 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13839 is_packed);
13840 }
13841
13842 /* If VALS is a vector constant that can be loaded into a register
13843 using DUP, generate instructions to do so and return an RTX to
13844 assign to the register. Otherwise return NULL_RTX. */
13845 static rtx
13846 aarch64_simd_dup_constant (rtx vals)
13847 {
13848 machine_mode mode = GET_MODE (vals);
13849 machine_mode inner_mode = GET_MODE_INNER (mode);
13850 rtx x;
13851
13852 if (!const_vec_duplicate_p (vals, &x))
13853 return NULL_RTX;
13854
13855 /* We can load this constant by using DUP and a constant in a
13856 single ARM register. This will be cheaper than a vector
13857 load. */
13858 x = copy_to_mode_reg (inner_mode, x);
13859 return gen_vec_duplicate (mode, x);
13860 }
13861
13862
13863 /* Generate code to load VALS, which is a PARALLEL containing only
13864 constants (for vec_init) or CONST_VECTOR, efficiently into a
13865 register. Returns an RTX to copy into the register, or NULL_RTX
13866 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13867 static rtx
13868 aarch64_simd_make_constant (rtx vals)
13869 {
13870 machine_mode mode = GET_MODE (vals);
13871 rtx const_dup;
13872 rtx const_vec = NULL_RTX;
13873 int n_const = 0;
13874 int i;
13875
13876 if (GET_CODE (vals) == CONST_VECTOR)
13877 const_vec = vals;
13878 else if (GET_CODE (vals) == PARALLEL)
13879 {
13880 /* A CONST_VECTOR must contain only CONST_INTs and
13881 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13882 Only store valid constants in a CONST_VECTOR. */
13883 int n_elts = XVECLEN (vals, 0);
13884 for (i = 0; i < n_elts; ++i)
13885 {
13886 rtx x = XVECEXP (vals, 0, i);
13887 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13888 n_const++;
13889 }
13890 if (n_const == n_elts)
13891 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13892 }
13893 else
13894 gcc_unreachable ();
13895
13896 if (const_vec != NULL_RTX
13897 && aarch64_simd_valid_immediate (const_vec, NULL))
13898 /* Load using MOVI/MVNI. */
13899 return const_vec;
13900 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13901 /* Loaded using DUP. */
13902 return const_dup;
13903 else if (const_vec != NULL_RTX)
13904 /* Load from constant pool. We cannot take advantage of single-cycle
13905 LD1 because we need a PC-relative addressing mode. */
13906 return const_vec;
13907 else
13908 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13909 We cannot construct an initializer. */
13910 return NULL_RTX;
13911 }
13912
13913 /* Expand a vector initialisation sequence, such that TARGET is
13914 initialised to contain VALS. */
13915
13916 void
13917 aarch64_expand_vector_init (rtx target, rtx vals)
13918 {
13919 machine_mode mode = GET_MODE (target);
13920 scalar_mode inner_mode = GET_MODE_INNER (mode);
13921 /* The number of vector elements. */
13922 int n_elts = XVECLEN (vals, 0);
13923 /* The number of vector elements which are not constant. */
13924 int n_var = 0;
13925 rtx any_const = NULL_RTX;
13926 /* The first element of vals. */
13927 rtx v0 = XVECEXP (vals, 0, 0);
13928 bool all_same = true;
13929
13930 /* Count the number of variable elements to initialise. */
13931 for (int i = 0; i < n_elts; ++i)
13932 {
13933 rtx x = XVECEXP (vals, 0, i);
13934 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13935 ++n_var;
13936 else
13937 any_const = x;
13938
13939 all_same &= rtx_equal_p (x, v0);
13940 }
13941
13942 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13943 how best to handle this. */
13944 if (n_var == 0)
13945 {
13946 rtx constant = aarch64_simd_make_constant (vals);
13947 if (constant != NULL_RTX)
13948 {
13949 emit_move_insn (target, constant);
13950 return;
13951 }
13952 }
13953
13954 /* Splat a single non-constant element if we can. */
13955 if (all_same)
13956 {
13957 rtx x = copy_to_mode_reg (inner_mode, v0);
13958 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13959 return;
13960 }
13961
13962 enum insn_code icode = optab_handler (vec_set_optab, mode);
13963 gcc_assert (icode != CODE_FOR_nothing);
13964
13965 /* If there are only variable elements, try to optimize
13966 the insertion using dup for the most common element
13967 followed by insertions. */
13968
13969 /* The algorithm will fill matches[*][0] with the earliest matching element,
13970 and matches[X][1] with the count of duplicate elements (if X is the
13971 earliest element which has duplicates). */
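/* For example (illustrative), for VALS == { x, y, x, x } the loop below
   leaves matches[0] == {0, 3}, matches[1] == {1, 1} and matches[2] ==
   matches[3] == {0, 0}, so element 0 wins and x is used for the initial
   DUP.  */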
13972
13973 if (n_var == n_elts && n_elts <= 16)
13974 {
13975 int matches[16][2] = {0};
13976 for (int i = 0; i < n_elts; i++)
13977 {
13978 for (int j = 0; j <= i; j++)
13979 {
13980 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13981 {
13982 matches[i][0] = j;
13983 matches[j][1]++;
13984 break;
13985 }
13986 }
13987 }
13988 int maxelement = 0;
13989 int maxv = 0;
13990 for (int i = 0; i < n_elts; i++)
13991 if (matches[i][1] > maxv)
13992 {
13993 maxelement = i;
13994 maxv = matches[i][1];
13995 }
13996
13997 /* Create a duplicate of the most common element, unless all elements
13998 are equally useless to us, in which case just immediately set the
13999 vector register using the first element. */
14000
14001 if (maxv == 1)
14002 {
14003 /* For vectors of two 64-bit elements, we can do even better. */
14004 if (n_elts == 2
14005 && (inner_mode == E_DImode
14006 || inner_mode == E_DFmode))
14007
14008 {
14009 rtx x0 = XVECEXP (vals, 0, 0);
14010 rtx x1 = XVECEXP (vals, 0, 1);
14011 /* Combine can pick up this case, but handling it directly
14012 here leaves clearer RTL.
14013
14014 This is load_pair_lanes<mode>, and also gives us a clean-up
14015 for store_pair_lanes<mode>. */
14016 if (memory_operand (x0, inner_mode)
14017 && memory_operand (x1, inner_mode)
14018 && !STRICT_ALIGNMENT
14019 && rtx_equal_p (XEXP (x1, 0),
14020 plus_constant (Pmode,
14021 XEXP (x0, 0),
14022 GET_MODE_SIZE (inner_mode))))
14023 {
14024 rtx t;
14025 if (inner_mode == DFmode)
14026 t = gen_load_pair_lanesdf (target, x0, x1);
14027 else
14028 t = gen_load_pair_lanesdi (target, x0, x1);
14029 emit_insn (t);
14030 return;
14031 }
14032 }
14033 /* The subreg-move sequence below will move into lane zero of the
14034 vector register. For big-endian we want that position to hold
14035 the last element of VALS. */
14036 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14037 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14038 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14039 }
14040 else
14041 {
14042 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14043 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14044 }
14045
14046 /* Insert the rest. */
14047 for (int i = 0; i < n_elts; i++)
14048 {
14049 rtx x = XVECEXP (vals, 0, i);
14050 if (matches[i][0] == maxelement)
14051 continue;
14052 x = copy_to_mode_reg (inner_mode, x);
14053 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14054 }
14055 return;
14056 }
14057
14058 /* Initialise a vector which is part-variable. We want to first try
14059 to build those lanes which are constant in the most efficient way we
14060 can. */
14061 if (n_var != n_elts)
14062 {
14063 rtx copy = copy_rtx (vals);
14064
14065 /* Load constant part of vector. We really don't care what goes into the
14066 parts we will overwrite, but we're more likely to be able to load the
14067 constant efficiently if it has fewer, larger, repeating parts
14068 (see aarch64_simd_valid_immediate). */
14069 for (int i = 0; i < n_elts; i++)
14070 {
14071 rtx x = XVECEXP (vals, 0, i);
14072 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14073 continue;
14074 rtx subst = any_const;
14075 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14076 {
14077 /* Look in the copied vector, as more elements are const. */
14078 rtx test = XVECEXP (copy, 0, i ^ bit);
14079 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14080 {
14081 subst = test;
14082 break;
14083 }
14084 }
14085 XVECEXP (copy, 0, i) = subst;
14086 }
14087 aarch64_expand_vector_init (target, copy);
14088 }
14089
14090 /* Insert the variable lanes directly. */
14091 for (int i = 0; i < n_elts; i++)
14092 {
14093 rtx x = XVECEXP (vals, 0, i);
14094 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14095 continue;
14096 x = copy_to_mode_reg (inner_mode, x);
14097 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14098 }
14099 }
14100
14101 static unsigned HOST_WIDE_INT
14102 aarch64_shift_truncation_mask (machine_mode mode)
14103 {
14104 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14105 return 0;
14106 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14107 }
14108
14109 /* Select a format to encode pointers in exception handling data. */
14110 int
14111 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14112 {
14113 int type;
14114 switch (aarch64_cmodel)
14115 {
14116 case AARCH64_CMODEL_TINY:
14117 case AARCH64_CMODEL_TINY_PIC:
14118 case AARCH64_CMODEL_SMALL:
14119 case AARCH64_CMODEL_SMALL_PIC:
14120 case AARCH64_CMODEL_SMALL_SPIC:
14121 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14122 for everything. */
14123 type = DW_EH_PE_sdata4;
14124 break;
14125 default:
14126 /* No assumptions here. 8-byte relocs required. */
14127 type = DW_EH_PE_sdata8;
14128 break;
14129 }
14130 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14131 }
14132
14133 /* The last .arch and .tune assembly strings that we printed. */
14134 static std::string aarch64_last_printed_arch_string;
14135 static std::string aarch64_last_printed_tune_string;
14136
14137 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14138 by the function fndecl. */
14139
14140 void
14141 aarch64_declare_function_name (FILE *stream, const char* name,
14142 tree fndecl)
14143 {
14144 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14145
14146 struct cl_target_option *targ_options;
14147 if (target_parts)
14148 targ_options = TREE_TARGET_OPTION (target_parts);
14149 else
14150 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14151 gcc_assert (targ_options);
14152
14153 const struct processor *this_arch
14154 = aarch64_get_arch (targ_options->x_explicit_arch);
14155
14156 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14157 std::string extension
14158 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14159 this_arch->flags);
14160 /* Only update the assembler .arch string if it is distinct from the last
14161 such string we printed. */
14162 std::string to_print = this_arch->name + extension;
14163 if (to_print != aarch64_last_printed_arch_string)
14164 {
14165 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14166 aarch64_last_printed_arch_string = to_print;
14167 }
14168
14169 /* Print the cpu name we're tuning for in the comments; it might be
14170 useful to readers of the generated asm. Do it only when it changes
14171 from function to function and verbose assembly is requested. */
14172 const struct processor *this_tune
14173 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14174
14175 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14176 {
14177 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14178 this_tune->name);
14179 aarch64_last_printed_tune_string = this_tune->name;
14180 }
14181
14182 /* Don't forget the type directive for ELF. */
14183 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14184 ASM_OUTPUT_LABEL (stream, name);
14185 }
14186
14187 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14188
14189 static void
14190 aarch64_start_file (void)
14191 {
14192 struct cl_target_option *default_options
14193 = TREE_TARGET_OPTION (target_option_default_node);
14194
14195 const struct processor *default_arch
14196 = aarch64_get_arch (default_options->x_explicit_arch);
14197 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14198 std::string extension
14199 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14200 default_arch->flags);
14201
14202 aarch64_last_printed_arch_string = default_arch->name + extension;
14203 aarch64_last_printed_tune_string = "";
14204 asm_fprintf (asm_out_file, "\t.arch %s\n",
14205 aarch64_last_printed_arch_string.c_str ());
14206
14207 default_file_start ();
14208 }
14209
14210 /* Emit load exclusive. */
14211
14212 static void
14213 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14214 rtx mem, rtx model_rtx)
14215 {
14216 rtx (*gen) (rtx, rtx, rtx);
14217
14218 switch (mode)
14219 {
14220 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14221 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14222 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14223 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14224 default:
14225 gcc_unreachable ();
14226 }
14227
14228 emit_insn (gen (rval, mem, model_rtx));
14229 }
14230
14231 /* Emit store exclusive. */
14232
14233 static void
14234 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14235 rtx rval, rtx mem, rtx model_rtx)
14236 {
14237 rtx (*gen) (rtx, rtx, rtx, rtx);
14238
14239 switch (mode)
14240 {
14241 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14242 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14243 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14244 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14245 default:
14246 gcc_unreachable ();
14247 }
14248
14249 emit_insn (gen (bval, rval, mem, model_rtx));
14250 }
14251
14252 /* Mark the previous jump instruction as unlikely. */
14253
14254 static void
14255 aarch64_emit_unlikely_jump (rtx insn)
14256 {
14257 rtx_insn *jump = emit_jump_insn (insn);
14258 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14259 }
14260
14261 /* Expand a compare and swap pattern. */
14262
14263 void
14264 aarch64_expand_compare_and_swap (rtx operands[])
14265 {
14266 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14267 machine_mode mode, cmp_mode;
14268 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14269 int idx;
14270 gen_cas_fn gen;
14271 const gen_cas_fn split_cas[] =
14272 {
14273 gen_aarch64_compare_and_swapqi,
14274 gen_aarch64_compare_and_swaphi,
14275 gen_aarch64_compare_and_swapsi,
14276 gen_aarch64_compare_and_swapdi
14277 };
14278 const gen_cas_fn atomic_cas[] =
14279 {
14280 gen_aarch64_compare_and_swapqi_lse,
14281 gen_aarch64_compare_and_swaphi_lse,
14282 gen_aarch64_compare_and_swapsi_lse,
14283 gen_aarch64_compare_and_swapdi_lse
14284 };
14285
14286 bval = operands[0];
14287 rval = operands[1];
14288 mem = operands[2];
14289 oldval = operands[3];
14290 newval = operands[4];
14291 is_weak = operands[5];
14292 mod_s = operands[6];
14293 mod_f = operands[7];
14294 mode = GET_MODE (mem);
14295 cmp_mode = mode;
14296
14297 /* Normally the succ memory model must be stronger than fail, but in the
14298 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14299 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14300
14301 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14302 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14303 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14304
14305 switch (mode)
14306 {
14307 case E_QImode:
14308 case E_HImode:
14309 /* For short modes, we're going to perform the comparison in SImode,
14310 so do the zero-extension now. */
14311 cmp_mode = SImode;
14312 rval = gen_reg_rtx (SImode);
14313 oldval = convert_modes (SImode, mode, oldval, true);
14314 /* Fall through. */
14315
14316 case E_SImode:
14317 case E_DImode:
14318 /* Force the value into a register if needed. */
14319 if (!aarch64_plus_operand (oldval, mode))
14320 oldval = force_reg (cmp_mode, oldval);
14321 break;
14322
14323 default:
14324 gcc_unreachable ();
14325 }
14326
14327 switch (mode)
14328 {
14329 case E_QImode: idx = 0; break;
14330 case E_HImode: idx = 1; break;
14331 case E_SImode: idx = 2; break;
14332 case E_DImode: idx = 3; break;
14333 default:
14334 gcc_unreachable ();
14335 }
14336 if (TARGET_LSE)
14337 gen = atomic_cas[idx];
14338 else
14339 gen = split_cas[idx];
14340
14341 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14342
14343 if (mode == QImode || mode == HImode)
14344 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14345
14346 x = gen_rtx_REG (CCmode, CC_REGNUM);
14347 x = gen_rtx_EQ (SImode, x, const0_rtx);
14348 emit_insn (gen_rtx_SET (bval, x));
14349 }
14350
14351 /* Test whether the target supports using an atomic load-operate instruction.
14352 CODE is the operation and AFTER is TRUE if the data in memory after the
14353 operation should be returned and FALSE if the data before the operation
14354 should be returned. Returns FALSE if the operation isn't supported by the
14355 architecture. */
14356
14357 bool
14358 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14359 {
14360 if (!TARGET_LSE)
14361 return false;
14362
14363 switch (code)
14364 {
14365 case SET:
14366 case AND:
14367 case IOR:
14368 case XOR:
14369 case MINUS:
14370 case PLUS:
14371 return true;
14372 default:
14373 return false;
14374 }
14375 }
14376
14377 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14378 sequence implementing an atomic operation. */
14379
14380 static void
14381 aarch64_emit_post_barrier (enum memmodel model)
14382 {
14383 const enum memmodel base_model = memmodel_base (model);
14384
14385 if (is_mm_sync (model)
14386 && (base_model == MEMMODEL_ACQUIRE
14387 || base_model == MEMMODEL_ACQ_REL
14388 || base_model == MEMMODEL_SEQ_CST))
14389 {
14390 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14391 }
14392 }
14393
14394 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14395 for the data in memory. EXPECTED is the value expected to be in memory.
14396 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14397 is the memory ordering to use. */
14398
14399 void
14400 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14401 rtx expected, rtx desired,
14402 rtx model)
14403 {
14404 rtx (*gen) (rtx, rtx, rtx, rtx);
14405 machine_mode mode;
14406
14407 mode = GET_MODE (mem);
14408
14409 switch (mode)
14410 {
14411 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14412 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14413 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14414 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14415 default:
14416 gcc_unreachable ();
14417 }
14418
14419 /* Move the expected value into the CAS destination register. */
14420 emit_insn (gen_rtx_SET (rval, expected));
14421
14422 /* Emit the CAS. */
14423 emit_insn (gen (rval, mem, desired, model));
14424
14425 /* Compare the expected value with the value loaded by the CAS, to establish
14426 whether the swap was made. */
14427 aarch64_gen_compare_reg (EQ, rval, expected);
14428 }
14429
14430 /* Split a compare and swap pattern. */
14431
14432 void
14433 aarch64_split_compare_and_swap (rtx operands[])
14434 {
14435 rtx rval, mem, oldval, newval, scratch;
14436 machine_mode mode;
14437 bool is_weak;
14438 rtx_code_label *label1, *label2;
14439 rtx x, cond;
14440 enum memmodel model;
14441 rtx model_rtx;
14442
14443 rval = operands[0];
14444 mem = operands[1];
14445 oldval = operands[2];
14446 newval = operands[3];
14447 is_weak = (operands[4] != const0_rtx);
14448 model_rtx = operands[5];
14449 scratch = operands[7];
14450 mode = GET_MODE (mem);
14451 model = memmodel_from_int (INTVAL (model_rtx));
14452
14453 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14454 loop:
14455 .label1:
14456 LD[A]XR rval, [mem]
14457 CBNZ rval, .label2
14458 ST[L]XR scratch, newval, [mem]
14459 CBNZ scratch, .label1
14460 .label2:
14461 CMP rval, 0. */
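/* For the strong compare-and-swap with a nonzero OLDVAL the sequence is
   roughly (illustrative sketch, not emitted verbatim):
   .label1:
     LD[A]XR rval, [mem]
     CMP     rval, oldval
     B.NE    .label2
     ST[L]XR scratch, newval, [mem]
     CBNZ    scratch, .label1
   .label2:  */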
14462 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14463
14464 label1 = NULL;
14465 if (!is_weak)
14466 {
14467 label1 = gen_label_rtx ();
14468 emit_label (label1);
14469 }
14470 label2 = gen_label_rtx ();
14471
14472 /* The initial load can be relaxed for a __sync operation since a final
14473 barrier will be emitted to stop code hoisting. */
14474 if (is_mm_sync (model))
14475 aarch64_emit_load_exclusive (mode, rval, mem,
14476 GEN_INT (MEMMODEL_RELAXED));
14477 else
14478 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14479
14480 if (strong_zero_p)
14481 {
14482 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14483 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14484 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14485 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14486 }
14487 else
14488 {
14489 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14490 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14491 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14492 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14493 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14494 }
14495
14496 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14497
14498 if (!is_weak)
14499 {
14500 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14501 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14502 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14503 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14504 }
14505 else
14506 {
14507 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14508 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14509 emit_insn (gen_rtx_SET (cond, x));
14510 }
14511
14512 emit_label (label2);
14513 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14514 to set the condition flags. If this is not used it will be removed by
14515 later passes. */
14516 if (strong_zero_p)
14517 {
14518 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14519 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14520 emit_insn (gen_rtx_SET (cond, x));
14521 }
14522 /* Emit any final barrier needed for a __sync operation. */
14523 if (is_mm_sync (model))
14524 aarch64_emit_post_barrier (model);
14525 }
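
/* For reference, the general (non-zero OLDVAL) strong sequence split out
   above is roughly, with size and acquire/release suffixes chosen from MODE
   and MODEL:

	.label1:
	  LD[A]XR	rval, [mem]
	  CMP		rval, oldval
	  B.NE		.label2
	  ST[L]XR	scratch, newval, [mem]
	  CBNZ		scratch, .label1
	.label2:

   The weak form omits the retry loop and instead just sets the condition
   flags from SCRATCH.  */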
14526
14527 /* Emit a BIC instruction, computing DST = S1 & ~(S2 >> SHIFT).  */
14528
14529 static void
14530 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14531 {
14532 rtx shift_rtx = GEN_INT (shift);
14533 rtx (*gen) (rtx, rtx, rtx, rtx);
14534
14535 switch (mode)
14536 {
14537 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14538 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14539 default:
14540 gcc_unreachable ();
14541 }
14542
14543 emit_insn (gen (dst, s2, shift_rtx, s1));
14544 }
14545
14546 /* Emit an atomic swap. */
14547
14548 static void
14549 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14550 rtx mem, rtx model)
14551 {
14552 rtx (*gen) (rtx, rtx, rtx, rtx);
14553
14554 switch (mode)
14555 {
14556 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14557 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14558 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14559 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14560 default:
14561 gcc_unreachable ();
14562 }
14563
14564 emit_insn (gen (dst, mem, value, model));
14565 }
14566
14567 /* Operations supported by aarch64_emit_atomic_load_op. */
14568
14569 enum aarch64_atomic_load_op_code
14570 {
14571 AARCH64_LDOP_PLUS, /* A + B */
14572 AARCH64_LDOP_XOR, /* A ^ B */
14573 AARCH64_LDOP_OR, /* A | B */
14574 AARCH64_LDOP_BIC /* A & ~B */
14575 };
14576
14577 /* Emit an atomic load-operate. */
14578
14579 static void
14580 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14581 machine_mode mode, rtx dst, rtx src,
14582 rtx mem, rtx model)
14583 {
14584 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14585 const aarch64_atomic_load_op_fn plus[] =
14586 {
14587 gen_aarch64_atomic_loadaddqi,
14588 gen_aarch64_atomic_loadaddhi,
14589 gen_aarch64_atomic_loadaddsi,
14590 gen_aarch64_atomic_loadadddi
14591 };
14592 const aarch64_atomic_load_op_fn eor[] =
14593 {
14594 gen_aarch64_atomic_loadeorqi,
14595 gen_aarch64_atomic_loadeorhi,
14596 gen_aarch64_atomic_loadeorsi,
14597 gen_aarch64_atomic_loadeordi
14598 };
14599 const aarch64_atomic_load_op_fn ior[] =
14600 {
14601 gen_aarch64_atomic_loadsetqi,
14602 gen_aarch64_atomic_loadsethi,
14603 gen_aarch64_atomic_loadsetsi,
14604 gen_aarch64_atomic_loadsetdi
14605 };
14606 const aarch64_atomic_load_op_fn bic[] =
14607 {
14608 gen_aarch64_atomic_loadclrqi,
14609 gen_aarch64_atomic_loadclrhi,
14610 gen_aarch64_atomic_loadclrsi,
14611 gen_aarch64_atomic_loadclrdi
14612 };
14613 aarch64_atomic_load_op_fn gen;
14614 int idx = 0;
14615
14616 switch (mode)
14617 {
14618 case E_QImode: idx = 0; break;
14619 case E_HImode: idx = 1; break;
14620 case E_SImode: idx = 2; break;
14621 case E_DImode: idx = 3; break;
14622 default:
14623 gcc_unreachable ();
14624 }
14625
14626 switch (code)
14627 {
14628 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14629 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14630 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14631 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14632 default:
14633 gcc_unreachable ();
14634 }
14635
14636 emit_insn (gen (dst, mem, src, model));
14637 }
14638
14639 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14640 location to store the data read from memory. OUT_RESULT is the location to
14641 store the result of the operation. MEM is the memory location to read and
14642 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14643 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14644 be NULL. */
14645
14646 void
14647 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14648 rtx mem, rtx value, rtx model_rtx)
14649 {
14650 machine_mode mode = GET_MODE (mem);
14651 machine_mode wmode = (mode == DImode ? DImode : SImode);
14652 const bool short_mode = (mode < SImode);
14653 aarch64_atomic_load_op_code ldop_code;
14654 rtx src;
14655 rtx x;
14656
14657 if (out_data)
14658 out_data = gen_lowpart (mode, out_data);
14659
14660 if (out_result)
14661 out_result = gen_lowpart (mode, out_result);
14662
14663 /* Make sure the value is in a register, putting it into a destination
14664 register if it needs to be manipulated. */
14665 if (!register_operand (value, mode)
14666 || code == AND || code == MINUS)
14667 {
14668 src = out_result ? out_result : out_data;
14669 emit_move_insn (src, gen_lowpart (mode, value));
14670 }
14671 else
14672 src = value;
14673 gcc_assert (register_operand (src, mode));
14674
14675 /* Preprocess the data for the operation as necessary. If the operation is
14676 a SET then emit a swap instruction and finish. */
14677 switch (code)
14678 {
14679 case SET:
14680 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14681 return;
14682
14683 case MINUS:
14684 /* Negate the value and treat it as a PLUS. */
14685 {
14686 rtx neg_src;
14687
14688 /* Resize the value if necessary. */
14689 if (short_mode)
14690 src = gen_lowpart (wmode, src);
14691
14692 neg_src = gen_rtx_NEG (wmode, src);
14693 emit_insn (gen_rtx_SET (src, neg_src));
14694
14695 if (short_mode)
14696 src = gen_lowpart (mode, src);
14697 }
14698 /* Fall-through. */
14699 case PLUS:
14700 ldop_code = AARCH64_LDOP_PLUS;
14701 break;
14702
14703 case IOR:
14704 ldop_code = AARCH64_LDOP_OR;
14705 break;
14706
14707 case XOR:
14708 ldop_code = AARCH64_LDOP_XOR;
14709 break;
14710
14711 case AND:
14712 {
14713 rtx not_src;
14714
14715 /* Resize the value if necessary. */
14716 if (short_mode)
14717 src = gen_lowpart (wmode, src);
14718
14719 not_src = gen_rtx_NOT (wmode, src);
14720 emit_insn (gen_rtx_SET (src, not_src));
14721
14722 if (short_mode)
14723 src = gen_lowpart (mode, src);
14724 }
14725 ldop_code = AARCH64_LDOP_BIC;
14726 break;
14727
14728 default:
14729 /* The operation can't be done with atomic instructions. */
14730 gcc_unreachable ();
14731 }
14732
14733 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14734
14735 /* If necessary, calculate the data in memory after the update by redoing the
14736 operation from values in registers. */
14737 if (!out_result)
14738 return;
14739
14740 if (short_mode)
14741 {
14742 src = gen_lowpart (wmode, src);
14743 out_data = gen_lowpart (wmode, out_data);
14744 out_result = gen_lowpart (wmode, out_result);
14745 }
14746
14747 x = NULL_RTX;
14748
14749 switch (code)
14750 {
14751 case MINUS:
14752 case PLUS:
14753 x = gen_rtx_PLUS (wmode, out_data, src);
14754 break;
14755 case IOR:
14756 x = gen_rtx_IOR (wmode, out_data, src);
14757 break;
14758 case XOR:
14759 x = gen_rtx_XOR (wmode, out_data, src);
14760 break;
14761 case AND:
14762 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14763 return;
14764 default:
14765 gcc_unreachable ();
14766 }
14767
14768 emit_set_insn (out_result, x);
14769
14770 return;
14771 }
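
/* For illustration: with LSE (ARMv8.1-A atomics), a __atomic_fetch_and on an
   int is handled here by inverting VALUE in a register and emitting LDCLR
   (AARCH64_LDOP_BIC), since the architecture provides "load and bit clear"
   rather than a plain atomic AND.  If the updated memory value is also
   required, it is recomputed afterwards as OUT_DATA & ~SRC via
   aarch64_emit_bic.  Similarly, a fetch-and-subtract negates VALUE and uses
   LDADD.  */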
14772
14773 /* Split an atomic operation. */
14774
14775 void
14776 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14777 rtx value, rtx model_rtx, rtx cond)
14778 {
14779 machine_mode mode = GET_MODE (mem);
14780 machine_mode wmode = (mode == DImode ? DImode : SImode);
14781 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14782 const bool is_sync = is_mm_sync (model);
14783 rtx_code_label *label;
14784 rtx x;
14785
14786 /* Split the atomic operation into a sequence. */
14787 label = gen_label_rtx ();
14788 emit_label (label);
14789
14790 if (new_out)
14791 new_out = gen_lowpart (wmode, new_out);
14792 if (old_out)
14793 old_out = gen_lowpart (wmode, old_out);
14794 else
14795 old_out = new_out;
14796 value = simplify_gen_subreg (wmode, value, mode, 0);
14797
14798 /* The initial load can be relaxed for a __sync operation since a final
14799 barrier will be emitted to stop code hoisting. */
14800 if (is_sync)
14801 aarch64_emit_load_exclusive (mode, old_out, mem,
14802 GEN_INT (MEMMODEL_RELAXED));
14803 else
14804 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14805
14806 switch (code)
14807 {
14808 case SET:
14809 new_out = value;
14810 break;
14811
14812 case NOT:
14813 x = gen_rtx_AND (wmode, old_out, value);
14814 emit_insn (gen_rtx_SET (new_out, x));
14815 x = gen_rtx_NOT (wmode, new_out);
14816 emit_insn (gen_rtx_SET (new_out, x));
14817 break;
14818
14819 case MINUS:
14820 if (CONST_INT_P (value))
14821 {
14822 value = GEN_INT (-INTVAL (value));
14823 code = PLUS;
14824 }
14825 /* Fall through. */
14826
14827 default:
14828 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14829 emit_insn (gen_rtx_SET (new_out, x));
14830 break;
14831 }
14832
14833 aarch64_emit_store_exclusive (mode, cond, mem,
14834 gen_lowpart (mode, new_out), model_rtx);
14835
14836 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14837 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14838 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14839 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14840
14841 /* Emit any final barrier needed for a __sync operation. */
14842 if (is_sync)
14843 aarch64_emit_post_barrier (model);
14844 }
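
/* For example, without LSE a relaxed __atomic_fetch_add on an int is split
   into roughly:

	.loop:
	  LDXR	old, [mem]
	  ADD	new, old, value
	  STXR	cond, new, [mem]
	  CBNZ	cond, .loop

   with acquire/release variants of the exclusives used for stronger memory
   models, and a trailing memory barrier added for __sync operations.  */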
14845
14846 static void
14847 aarch64_init_libfuncs (void)
14848 {
14849 /* Half-precision float operations. The compiler handles all operations
14850 with NULL libfuncs by converting to SFmode. */
14851
14852 /* Conversions. */
14853 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14854 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14855
14856 /* Arithmetic. */
14857 set_optab_libfunc (add_optab, HFmode, NULL);
14858 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14859 set_optab_libfunc (smul_optab, HFmode, NULL);
14860 set_optab_libfunc (neg_optab, HFmode, NULL);
14861 set_optab_libfunc (sub_optab, HFmode, NULL);
14862
14863 /* Comparisons. */
14864 set_optab_libfunc (eq_optab, HFmode, NULL);
14865 set_optab_libfunc (ne_optab, HFmode, NULL);
14866 set_optab_libfunc (lt_optab, HFmode, NULL);
14867 set_optab_libfunc (le_optab, HFmode, NULL);
14868 set_optab_libfunc (ge_optab, HFmode, NULL);
14869 set_optab_libfunc (gt_optab, HFmode, NULL);
14870 set_optab_libfunc (unord_optab, HFmode, NULL);
14871 }
14872
14873 /* Target hook for c_mode_for_suffix. */
14874 static machine_mode
14875 aarch64_c_mode_for_suffix (char suffix)
14876 {
14877 if (suffix == 'q')
14878 return TFmode;
14879
14880 return VOIDmode;
14881 }
14882
14883 /* We can only represent floating point constants which will fit in
14884 "quarter-precision" values. These values are characterised by
14885    "quarter-precision" values.  These values are characterised by
14886    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14887
14888 (-1)^s * (n/16) * 2^r
14889
14890 Where:
14891 's' is the sign bit.
14892 'n' is an integer in the range 16 <= n <= 31.
14893 'r' is an integer in the range -3 <= r <= 4. */
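
/* For example, 1.0 is (+1) * (16/16) * 2^0 and 0.3125 is (+1) * (20/16) * 2^-2,
   so both are representable; the representable magnitudes run from
   0.125 (16/16 * 2^-3) up to 31.0 (31/16 * 2^4), and values such as 0.1
   that are not of this form are rejected.  */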
14894
14895 /* Return true iff X can be represented as a quarter-precision
14896    floating point immediate operand.  Note that we cannot represent 0.0.  */
14897 bool
14898 aarch64_float_const_representable_p (rtx x)
14899 {
14900 /* This represents our current view of how many bits
14901 make up the mantissa. */
14902 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14903 int exponent;
14904 unsigned HOST_WIDE_INT mantissa, mask;
14905 REAL_VALUE_TYPE r, m;
14906 bool fail;
14907
14908 if (!CONST_DOUBLE_P (x))
14909 return false;
14910
14911 /* We don't support HFmode constants yet. */
14912 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14913 return false;
14914
14915 r = *CONST_DOUBLE_REAL_VALUE (x);
14916
14917 /* We cannot represent infinities, NaNs or +/-zero. We won't
14918 know if we have +zero until we analyse the mantissa, but we
14919 can reject the other invalid values. */
14920 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14921 || REAL_VALUE_MINUS_ZERO (r))
14922 return false;
14923
14924 /* Extract exponent. */
14925 r = real_value_abs (&r);
14926 exponent = REAL_EXP (&r);
14927
14928   /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
14929      highest (sign) bit, with a fixed binary point at bit point_pos.
14930      The low element of W holds the low part of the mantissa, the high element the high part.
14931      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14932      bits for the mantissa, this can fail (low bits will be lost). */
14933 real_ldexp (&m, &r, point_pos - exponent);
14934 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14935
14936 /* If the low part of the mantissa has bits set we cannot represent
14937 the value. */
14938 if (w.ulow () != 0)
14939 return false;
14940 /* We have rejected the lower HOST_WIDE_INT, so update our
14941 understanding of how many bits lie in the mantissa and
14942 look only at the high HOST_WIDE_INT. */
14943 mantissa = w.elt (1);
14944 point_pos -= HOST_BITS_PER_WIDE_INT;
14945
14946 /* We can only represent values with a mantissa of the form 1.xxxx. */
14947 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14948 if ((mantissa & mask) != 0)
14949 return false;
14950
14951 /* Having filtered unrepresentable values, we may now remove all
14952 but the highest 5 bits. */
14953 mantissa >>= point_pos - 5;
14954
14955 /* We cannot represent the value 0.0, so reject it. This is handled
14956 elsewhere. */
14957 if (mantissa == 0)
14958 return false;
14959
14960 /* Then, as bit 4 is always set, we can mask it off, leaving
14961 the mantissa in the range [0, 15]. */
14962 mantissa &= ~(1 << 4);
14963 gcc_assert (mantissa <= 15);
14964
14965 /* GCC internally does not use IEEE754-like encoding (where normalized
14966 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14967 Our mantissa values are shifted 4 places to the left relative to
14968 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14969 by 5 places to correct for GCC's representation. */
14970 exponent = 5 - exponent;
14971
14972 return (exponent >= 0 && exponent <= 7);
14973 }
14974
14975 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14976 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14977 output MOVI/MVNI, ORR or BIC immediate. */
14978 char*
14979 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14980 enum simd_immediate_check which)
14981 {
14982 bool is_valid;
14983 static char templ[40];
14984 const char *mnemonic;
14985 const char *shift_op;
14986 unsigned int lane_count = 0;
14987 char element_char;
14988
14989 struct simd_immediate_info info;
14990
14991 /* This will return true to show const_vector is legal for use as either
14992      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14993 It will also update INFO to show how the immediate should be generated.
14994 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14995 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14996 gcc_assert (is_valid);
14997
14998 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14999 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15000
15001 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15002 {
15003 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15004 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15005 move immediate path. */
15006 if (aarch64_float_const_zero_rtx_p (info.value))
15007 info.value = GEN_INT (0);
15008 else
15009 {
15010 const unsigned int buf_size = 20;
15011 char float_buf[buf_size] = {'\0'};
15012 real_to_decimal_for_mode (float_buf,
15013 CONST_DOUBLE_REAL_VALUE (info.value),
15014 buf_size, buf_size, 1, info.elt_mode);
15015
15016 if (lane_count == 1)
15017 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15018 else
15019 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15020 lane_count, element_char, float_buf);
15021 return templ;
15022 }
15023 }
15024
15025 gcc_assert (CONST_INT_P (info.value));
15026
15027 if (which == AARCH64_CHECK_MOV)
15028 {
15029 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15030 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15031 if (lane_count == 1)
15032 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15033 mnemonic, UINTVAL (info.value));
15034 else if (info.shift)
15035 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15036 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15037 element_char, UINTVAL (info.value), shift_op, info.shift);
15038 else
15039 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15040 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15041 element_char, UINTVAL (info.value));
15042 }
15043 else
15044 {
15045 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15046 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15047 if (info.shift)
15048 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15049 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15050 element_char, UINTVAL (info.value), "lsl", info.shift);
15051 else
15052 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15053 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15054 element_char, UINTVAL (info.value));
15055 }
15056 return templ;
15057 }
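
/* For example, a V4SImode vector with all elements equal to 0x100 is
   recognised as the 8-bit value 0x1 shifted left by 8 and printed as
   "movi\t%0.4s, 0x1, lsl 8", while a V16QImode vector of all-ones-valued
   bytes (0x01) prints as "movi\t%0.16b, 0x1".  */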
15058
15059 char*
15060 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15061 {
15062
15063 /* If a floating point number was passed and we desire to use it in an
15064      integer mode, do the conversion to integer.  */
15065 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15066 {
15067 unsigned HOST_WIDE_INT ival;
15068 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15069 gcc_unreachable ();
15070 immediate = gen_int_mode (ival, mode);
15071 }
15072
15073 machine_mode vmode;
15074   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
15075      a 128-bit vector mode.  */
15076 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15077
15078 vmode = aarch64_simd_container_mode (mode, width);
15079 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15080 return aarch64_output_simd_mov_immediate (v_op, width);
15081 }
15082
15083 /* Return the output string to use for moving immediate CONST_VECTOR
15084 into an SVE register. */
15085
15086 char *
15087 aarch64_output_sve_mov_immediate (rtx const_vector)
15088 {
15089 static char templ[40];
15090 struct simd_immediate_info info;
15091 char element_char;
15092
15093 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15094 gcc_assert (is_valid);
15095
15096 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15097
15098 if (info.step)
15099 {
15100 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15101 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15102 element_char, INTVAL (info.value), INTVAL (info.step));
15103 return templ;
15104 }
15105
15106 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15107 {
15108 if (aarch64_float_const_zero_rtx_p (info.value))
15109 info.value = GEN_INT (0);
15110 else
15111 {
15112 const int buf_size = 20;
15113 char float_buf[buf_size] = {};
15114 real_to_decimal_for_mode (float_buf,
15115 CONST_DOUBLE_REAL_VALUE (info.value),
15116 buf_size, buf_size, 1, info.elt_mode);
15117
15118 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15119 element_char, float_buf);
15120 return templ;
15121 }
15122 }
15123
15124 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15125 element_char, INTVAL (info.value));
15126 return templ;
15127 }
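
/* For example, an SVE constant { 0, 1, 2, 3, ... } of 32-bit elements is
   emitted as "index\t%0.s, #0, #1", and a splat of the byte value 1 as
   "mov\t%0.b, #1".  */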
15128
15129 /* Return the asm format for a PTRUE instruction whose destination has
15130 mode MODE. SUFFIX is the element size suffix. */
15131
15132 char *
15133 aarch64_output_ptrue (machine_mode mode, char suffix)
15134 {
15135 unsigned int nunits;
15136 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15137 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15138 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15139 else
15140 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15141 return buf;
15142 }
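
/* For example, this yields "ptrue\t%0.s, vl8" when the number of elements is
   known at compile time (e.g. with a fixed SVE vector length) and
   "ptrue\t%0.s, all" otherwise.  */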
15143
15144 /* Split operands into moves from op[1] + op[2] into op[0]. */
15145
15146 void
15147 aarch64_split_combinev16qi (rtx operands[3])
15148 {
15149 unsigned int dest = REGNO (operands[0]);
15150 unsigned int src1 = REGNO (operands[1]);
15151 unsigned int src2 = REGNO (operands[2]);
15152 machine_mode halfmode = GET_MODE (operands[1]);
15153 unsigned int halfregs = REG_NREGS (operands[1]);
15154 rtx destlo, desthi;
15155
15156 gcc_assert (halfmode == V16QImode);
15157
15158 if (src1 == dest && src2 == dest + halfregs)
15159 {
15160 /* No-op move. Can't split to nothing; emit something. */
15161 emit_note (NOTE_INSN_DELETED);
15162 return;
15163 }
15164
15165 /* Preserve register attributes for variable tracking. */
15166 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15167 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15168 GET_MODE_SIZE (halfmode));
15169
15170 /* Special case of reversed high/low parts. */
15171 if (reg_overlap_mentioned_p (operands[2], destlo)
15172 && reg_overlap_mentioned_p (operands[1], desthi))
15173 {
15174 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15175 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15176 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15177 }
15178 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15179 {
15180 /* Try to avoid unnecessary moves if part of the result
15181 is in the right place already. */
15182 if (src1 != dest)
15183 emit_move_insn (destlo, operands[1]);
15184 if (src2 != dest + halfregs)
15185 emit_move_insn (desthi, operands[2]);
15186 }
15187 else
15188 {
15189 if (src2 != dest + halfregs)
15190 emit_move_insn (desthi, operands[2]);
15191 if (src1 != dest)
15192 emit_move_insn (destlo, operands[1]);
15193 }
15194 }
15195
15196 /* vec_perm support. */
15197
15198 struct expand_vec_perm_d
15199 {
15200 rtx target, op0, op1;
15201 vec_perm_indices perm;
15202 machine_mode vmode;
15203 unsigned int vec_flags;
15204 bool one_vector_p;
15205 bool testing_p;
15206 };
15207
15208 /* Generate a variable permutation. */
15209
15210 static void
15211 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15212 {
15213 machine_mode vmode = GET_MODE (target);
15214 bool one_vector_p = rtx_equal_p (op0, op1);
15215
15216 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15217 gcc_checking_assert (GET_MODE (op0) == vmode);
15218 gcc_checking_assert (GET_MODE (op1) == vmode);
15219 gcc_checking_assert (GET_MODE (sel) == vmode);
15220 gcc_checking_assert (TARGET_SIMD);
15221
15222 if (one_vector_p)
15223 {
15224 if (vmode == V8QImode)
15225 {
15226 /* Expand the argument to a V16QI mode by duplicating it. */
15227 rtx pair = gen_reg_rtx (V16QImode);
15228 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15229 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15230 }
15231 else
15232 {
15233 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15234 }
15235 }
15236 else
15237 {
15238 rtx pair;
15239
15240 if (vmode == V8QImode)
15241 {
15242 pair = gen_reg_rtx (V16QImode);
15243 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15244 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15245 }
15246 else
15247 {
15248 pair = gen_reg_rtx (OImode);
15249 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15250 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15251 }
15252 }
15253 }
15254
15255 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15256 NELT is the number of elements in the vector. */
15257
15258 void
15259 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15260 unsigned int nelt)
15261 {
15262 machine_mode vmode = GET_MODE (target);
15263 bool one_vector_p = rtx_equal_p (op0, op1);
15264 rtx mask;
15265
15266 /* The TBL instruction does not use a modulo index, so we must take care
15267 of that ourselves. */
15268 mask = aarch64_simd_gen_const_vector_dup (vmode,
15269 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15270 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15271
15272 /* For big-endian, we also need to reverse the index within the vector
15273 (but not which vector). */
15274 if (BYTES_BIG_ENDIAN)
15275 {
15276 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15277 if (!one_vector_p)
15278 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15279 sel = expand_simple_binop (vmode, XOR, sel, mask,
15280 NULL, 0, OPTAB_LIB_WIDEN);
15281 }
15282 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15283 }
15284
15285 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15286
15287 static void
15288 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15289 {
15290 emit_insn (gen_rtx_SET (target,
15291 gen_rtx_UNSPEC (GET_MODE (target),
15292 gen_rtvec (2, op0, op1), code)));
15293 }
15294
15295 /* Expand an SVE vec_perm with the given operands. */
15296
15297 void
15298 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15299 {
15300 machine_mode data_mode = GET_MODE (target);
15301 machine_mode sel_mode = GET_MODE (sel);
15302 /* Enforced by the pattern condition. */
15303 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15304
15305 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15306 size of the two value vectors, i.e. the upper bits of the indices
15307 are effectively ignored. SVE TBL instead produces 0 for any
15308 out-of-range indices, so we need to modulo all the vec_perm indices
15309 to ensure they are all in range. */
15310 rtx sel_reg = force_reg (sel_mode, sel);
15311
15312 /* Check if the sel only references the first values vector. */
15313 if (GET_CODE (sel) == CONST_VECTOR
15314 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15315 {
15316 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15317 return;
15318 }
15319
15320 /* Check if the two values vectors are the same. */
15321 if (rtx_equal_p (op0, op1))
15322 {
15323 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15324 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15325 NULL, 0, OPTAB_DIRECT);
15326 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15327 return;
15328 }
15329
15330   /* Run TBL on each value vector and combine the results.  */
15331
15332 rtx res0 = gen_reg_rtx (data_mode);
15333 rtx res1 = gen_reg_rtx (data_mode);
15334 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15335 if (GET_CODE (sel) != CONST_VECTOR
15336 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15337 {
15338 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15339 2 * nunits - 1);
15340 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15341 NULL, 0, OPTAB_DIRECT);
15342 }
15343 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15344 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15345 NULL, 0, OPTAB_DIRECT);
15346 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15347 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15348 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15349 else
15350 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15351 }
15352
15353 /* Recognize patterns suitable for the TRN instructions. */
15354 static bool
15355 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15356 {
15357 HOST_WIDE_INT odd;
15358 poly_uint64 nelt = d->perm.length ();
15359 rtx out, in0, in1, x;
15360 machine_mode vmode = d->vmode;
15361
15362 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15363 return false;
15364
15365 /* Note that these are little-endian tests.
15366 We correct for big-endian later. */
15367 if (!d->perm[0].is_constant (&odd)
15368 || (odd != 0 && odd != 1)
15369 || !d->perm.series_p (0, 2, odd, 2)
15370 || !d->perm.series_p (1, 2, nelt + odd, 2))
15371 return false;
15372
15373 /* Success! */
15374 if (d->testing_p)
15375 return true;
15376
15377 in0 = d->op0;
15378 in1 = d->op1;
15379 /* We don't need a big-endian lane correction for SVE; see the comment
15380 at the head of aarch64-sve.md for details. */
15381 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15382 {
15383 x = in0, in0 = in1, in1 = x;
15384 odd = !odd;
15385 }
15386 out = d->target;
15387
15388 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15389 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15390 return true;
15391 }
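
/* For example, on V4SImode the permutation { 0, 4, 2, 6 } is matched above as
   TRN1 and { 1, 5, 3, 7 } as TRN2 (modulo the big-endian correction).  */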
15392
15393 /* Recognize patterns suitable for the UZP instructions. */
15394 static bool
15395 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15396 {
15397 HOST_WIDE_INT odd;
15398 rtx out, in0, in1, x;
15399 machine_mode vmode = d->vmode;
15400
15401 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15402 return false;
15403
15404 /* Note that these are little-endian tests.
15405 We correct for big-endian later. */
15406 if (!d->perm[0].is_constant (&odd)
15407 || (odd != 0 && odd != 1)
15408 || !d->perm.series_p (0, 1, odd, 2))
15409 return false;
15410
15411 /* Success! */
15412 if (d->testing_p)
15413 return true;
15414
15415 in0 = d->op0;
15416 in1 = d->op1;
15417 /* We don't need a big-endian lane correction for SVE; see the comment
15418 at the head of aarch64-sve.md for details. */
15419 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15420 {
15421 x = in0, in0 = in1, in1 = x;
15422 odd = !odd;
15423 }
15424 out = d->target;
15425
15426 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15427 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15428 return true;
15429 }
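
/* For example, on V4SImode the permutation { 0, 2, 4, 6 } is matched above as
   UZP1 and { 1, 3, 5, 7 } as UZP2.  */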
15430
15431 /* Recognize patterns suitable for the ZIP instructions. */
15432 static bool
15433 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15434 {
15435 unsigned int high;
15436 poly_uint64 nelt = d->perm.length ();
15437 rtx out, in0, in1, x;
15438 machine_mode vmode = d->vmode;
15439
15440 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15441 return false;
15442
15443 /* Note that these are little-endian tests.
15444 We correct for big-endian later. */
15445 poly_uint64 first = d->perm[0];
15446 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15447 || !d->perm.series_p (0, 2, first, 1)
15448 || !d->perm.series_p (1, 2, first + nelt, 1))
15449 return false;
15450 high = maybe_ne (first, 0U);
15451
15452 /* Success! */
15453 if (d->testing_p)
15454 return true;
15455
15456 in0 = d->op0;
15457 in1 = d->op1;
15458 /* We don't need a big-endian lane correction for SVE; see the comment
15459 at the head of aarch64-sve.md for details. */
15460 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15461 {
15462 x = in0, in0 = in1, in1 = x;
15463 high = !high;
15464 }
15465 out = d->target;
15466
15467 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15468 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15469 return true;
15470 }
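
/* For example, on V4SImode the permutation { 0, 4, 1, 5 } is matched above as
   ZIP1 and { 2, 6, 3, 7 } as ZIP2.  */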
15471
15472 /* Recognize patterns for the EXT insn. */
15473
15474 static bool
15475 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15476 {
15477 HOST_WIDE_INT location;
15478 rtx offset;
15479
15480 /* The first element always refers to the first vector.
15481 Check if the extracted indices are increasing by one. */
15482 if (d->vec_flags == VEC_SVE_PRED
15483 || !d->perm[0].is_constant (&location)
15484 || !d->perm.series_p (0, 1, location, 1))
15485 return false;
15486
15487 /* Success! */
15488 if (d->testing_p)
15489 return true;
15490
15491 /* The case where (location == 0) is a no-op for both big- and little-endian,
15492 and is removed by the mid-end at optimization levels -O1 and higher.
15493
15494 We don't need a big-endian lane correction for SVE; see the comment
15495 at the head of aarch64-sve.md for details. */
15496 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15497 {
15498 /* After setup, we want the high elements of the first vector (stored
15499 at the LSB end of the register), and the low elements of the second
15500 vector (stored at the MSB end of the register). So swap. */
15501 std::swap (d->op0, d->op1);
15502 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15503 to_constant () is safe since this is restricted to Advanced SIMD
15504 vectors. */
15505 location = d->perm.length ().to_constant () - location;
15506 }
15507
15508 offset = GEN_INT (location);
15509 emit_set_insn (d->target,
15510 gen_rtx_UNSPEC (d->vmode,
15511 gen_rtvec (3, d->op0, d->op1, offset),
15512 UNSPEC_EXT));
15513 return true;
15514 }
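
/* For example, on V4SImode the permutation { 1, 2, 3, 4 } selects elements
   1-3 of the first vector followed by element 0 of the second, and is
   matched above as an EXT with an element offset of 1.  */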
15515
15516 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15517 within each 64-bit, 32-bit or 16-bit granule. */
15518
15519 static bool
15520 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15521 {
15522 HOST_WIDE_INT diff;
15523 unsigned int i, size, unspec;
15524 machine_mode pred_mode;
15525
15526 if (d->vec_flags == VEC_SVE_PRED
15527 || !d->one_vector_p
15528 || !d->perm[0].is_constant (&diff))
15529 return false;
15530
15531 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15532 if (size == 8)
15533 {
15534 unspec = UNSPEC_REV64;
15535 pred_mode = VNx2BImode;
15536 }
15537 else if (size == 4)
15538 {
15539 unspec = UNSPEC_REV32;
15540 pred_mode = VNx4BImode;
15541 }
15542 else if (size == 2)
15543 {
15544 unspec = UNSPEC_REV16;
15545 pred_mode = VNx8BImode;
15546 }
15547 else
15548 return false;
15549
15550 unsigned int step = diff + 1;
15551 for (i = 0; i < step; ++i)
15552 if (!d->perm.series_p (i, step, diff - i, step))
15553 return false;
15554
15555 /* Success! */
15556 if (d->testing_p)
15557 return true;
15558
15559 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15560 if (d->vec_flags == VEC_SVE_DATA)
15561 {
15562 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15563 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15564 UNSPEC_MERGE_PTRUE);
15565 }
15566 emit_set_insn (d->target, src);
15567 return true;
15568 }
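
/* For example, on V8HImode the permutation { 3, 2, 1, 0, 7, 6, 5, 4 }
   reverses the elements within each 64-bit granule and is matched above
   as REV64.  */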
15569
15570 /* Recognize patterns for the REV insn, which reverses elements within
15571 a full vector. */
15572
15573 static bool
15574 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15575 {
15576 poly_uint64 nelt = d->perm.length ();
15577
15578 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15579 return false;
15580
15581 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15582 return false;
15583
15584 /* Success! */
15585 if (d->testing_p)
15586 return true;
15587
15588 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15589 emit_set_insn (d->target, src);
15590 return true;
15591 }
15592
15593 static bool
15594 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15595 {
15596 rtx out = d->target;
15597 rtx in0;
15598 HOST_WIDE_INT elt;
15599 machine_mode vmode = d->vmode;
15600 rtx lane;
15601
15602 if (d->vec_flags == VEC_SVE_PRED
15603 || d->perm.encoding ().encoded_nelts () != 1
15604 || !d->perm[0].is_constant (&elt))
15605 return false;
15606
15607   if (d->vec_flags == VEC_SVE_DATA && elt * GET_MODE_UNIT_SIZE (vmode) >= 64)
15608 return false;
15609
15610 /* Success! */
15611 if (d->testing_p)
15612 return true;
15613
15614 /* The generic preparation in aarch64_expand_vec_perm_const_1
15615 swaps the operand order and the permute indices if it finds
15616 d->perm[0] to be in the second operand. Thus, we can always
15617 use d->op0 and need not do any extra arithmetic to get the
15618 correct lane number. */
15619 in0 = d->op0;
15620 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15621
15622 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15623 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15624 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15625 return true;
15626 }
15627
15628 static bool
15629 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15630 {
15631 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15632 machine_mode vmode = d->vmode;
15633
15634 /* Make sure that the indices are constant. */
15635 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15636 for (unsigned int i = 0; i < encoded_nelts; ++i)
15637 if (!d->perm[i].is_constant ())
15638 return false;
15639
15640 if (d->testing_p)
15641 return true;
15642
15643   /* Generic code will try constant permutation twice: once with the
15644      original mode and again with the elements lowered to QImode.
15645 So wait and don't do the selector expansion ourselves. */
15646 if (vmode != V8QImode && vmode != V16QImode)
15647 return false;
15648
15649 /* to_constant is safe since this routine is specific to Advanced SIMD
15650 vectors. */
15651 unsigned int nelt = d->perm.length ().to_constant ();
15652 for (unsigned int i = 0; i < nelt; ++i)
15653     /* If big-endian and two vectors, we end up with a weird mixed-endian
15654 mode on NEON. Reverse the index within each word but not the word
15655 itself. to_constant is safe because we checked is_constant above. */
15656 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15657 ? d->perm[i].to_constant () ^ (nelt - 1)
15658 : d->perm[i].to_constant ());
15659
15660 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15661 sel = force_reg (vmode, sel);
15662
15663 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15664 return true;
15665 }
15666
15667 /* Try to implement D using an SVE TBL instruction. */
15668
15669 static bool
15670 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15671 {
15672 unsigned HOST_WIDE_INT nelt;
15673
15674 /* Permuting two variable-length vectors could overflow the
15675 index range. */
15676 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15677 return false;
15678
15679 if (d->testing_p)
15680 return true;
15681
15682 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15683 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15684 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15685 return true;
15686 }
15687
15688 static bool
15689 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15690 {
15691 /* The pattern matching functions above are written to look for a small
15692 number to begin the sequence (0, 1, N/2). If we begin with an index
15693 from the second operand, we can swap the operands. */
15694 poly_int64 nelt = d->perm.length ();
15695 if (known_ge (d->perm[0], nelt))
15696 {
15697 d->perm.rotate_inputs (1);
15698 std::swap (d->op0, d->op1);
15699 }
15700
15701 if ((d->vec_flags == VEC_ADVSIMD
15702 || d->vec_flags == VEC_SVE_DATA
15703 || d->vec_flags == VEC_SVE_PRED)
15704 && known_gt (nelt, 1))
15705 {
15706 if (aarch64_evpc_rev_local (d))
15707 return true;
15708 else if (aarch64_evpc_rev_global (d))
15709 return true;
15710 else if (aarch64_evpc_ext (d))
15711 return true;
15712 else if (aarch64_evpc_dup (d))
15713 return true;
15714 else if (aarch64_evpc_zip (d))
15715 return true;
15716 else if (aarch64_evpc_uzp (d))
15717 return true;
15718 else if (aarch64_evpc_trn (d))
15719 return true;
15720 if (d->vec_flags == VEC_SVE_DATA)
15721 return aarch64_evpc_sve_tbl (d);
15722       else if (d->vec_flags == VEC_ADVSIMD)
15723 return aarch64_evpc_tbl (d);
15724 }
15725 return false;
15726 }
15727
15728 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15729
15730 static bool
15731 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15732 rtx op1, const vec_perm_indices &sel)
15733 {
15734 struct expand_vec_perm_d d;
15735
15736 /* Check whether the mask can be applied to a single vector. */
15737 if (op0 && rtx_equal_p (op0, op1))
15738 d.one_vector_p = true;
15739 else if (sel.all_from_input_p (0))
15740 {
15741 d.one_vector_p = true;
15742 op1 = op0;
15743 }
15744 else if (sel.all_from_input_p (1))
15745 {
15746 d.one_vector_p = true;
15747 op0 = op1;
15748 }
15749 else
15750 d.one_vector_p = false;
15751
15752 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15753 sel.nelts_per_input ());
15754 d.vmode = vmode;
15755 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15756 d.target = target;
15757 d.op0 = op0;
15758 d.op1 = op1;
15759 d.testing_p = !target;
15760
15761 if (!d.testing_p)
15762 return aarch64_expand_vec_perm_const_1 (&d);
15763
15764 rtx_insn *last = get_last_insn ();
15765 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15766 gcc_assert (last == get_last_insn ());
15767
15768 return ret;
15769 }
15770
15771 /* Generate a byte permute mask for a register of mode MODE,
15772 which has NUNITS units. */
15773
15774 rtx
15775 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15776 {
15777   /* We have to reverse each vector because we don't have
15778      a permuted load that can reverse-load according to ABI rules.  */
15779 rtx mask;
15780 rtvec v = rtvec_alloc (16);
15781 unsigned int i, j;
15782 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15783
15784 gcc_assert (BYTES_BIG_ENDIAN);
15785 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15786
15787 for (i = 0; i < nunits; i++)
15788 for (j = 0; j < usize; j++)
15789 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15790 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15791 return force_reg (V16QImode, mask);
15792 }
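
/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask built above
   is the byte sequence { 1, 0, 3, 2, 5, 4, ..., 15, 14 }, which swaps the
   two bytes of each 16-bit element.  */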
15793
15794 /* Return true if X is a valid second operand for the SVE instruction
15795 that implements integer comparison OP_CODE. */
15796
15797 static bool
15798 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15799 {
15800 if (register_operand (x, VOIDmode))
15801 return true;
15802
15803 switch (op_code)
15804 {
15805 case LTU:
15806 case LEU:
15807 case GEU:
15808 case GTU:
15809 return aarch64_sve_cmp_immediate_p (x, false);
15810 case LT:
15811 case LE:
15812 case GE:
15813 case GT:
15814 case NE:
15815 case EQ:
15816 return aarch64_sve_cmp_immediate_p (x, true);
15817 default:
15818 gcc_unreachable ();
15819 }
15820 }
15821
15822 /* Use predicated SVE instructions to implement the equivalent of:
15823
15824 (set TARGET OP)
15825
15826 given that PTRUE is an all-true predicate of the appropriate mode. */
15827
15828 static void
15829 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15830 {
15831 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15832 gen_rtvec (2, ptrue, op),
15833 UNSPEC_MERGE_PTRUE);
15834 rtx_insn *insn = emit_set_insn (target, unspec);
15835 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15836 }
15837
15838 /* Likewise, but also clobber the condition codes. */
15839
15840 static void
15841 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15842 {
15843 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15844 gen_rtvec (2, ptrue, op),
15845 UNSPEC_MERGE_PTRUE);
15846 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15847 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15848 }
15849
15850 /* Return the UNSPEC_COND_* code for comparison CODE. */
15851
15852 static unsigned int
15853 aarch64_unspec_cond_code (rtx_code code)
15854 {
15855 switch (code)
15856 {
15857 case NE:
15858 return UNSPEC_COND_NE;
15859 case EQ:
15860 return UNSPEC_COND_EQ;
15861 case LT:
15862 return UNSPEC_COND_LT;
15863 case GT:
15864 return UNSPEC_COND_GT;
15865 case LE:
15866 return UNSPEC_COND_LE;
15867 case GE:
15868 return UNSPEC_COND_GE;
15869 default:
15870 gcc_unreachable ();
15871 }
15872 }
15873
15874 /* Emit:
15875
15876 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15877
15878 where <X> is the operation associated with comparison CODE. This form
15879 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15880 semantics, such as when PRED might not be all-true and when comparing
15881 inactive lanes could have side effects. */
15882
15883 static void
15884 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15885 rtx pred, rtx op0, rtx op1)
15886 {
15887 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15888 gen_rtvec (3, pred, op0, op1),
15889 aarch64_unspec_cond_code (code));
15890 emit_set_insn (target, unspec);
15891 }
15892
15893 /* Expand an SVE integer comparison using the SVE equivalent of:
15894
15895 (set TARGET (CODE OP0 OP1)). */
15896
15897 void
15898 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15899 {
15900 machine_mode pred_mode = GET_MODE (target);
15901 machine_mode data_mode = GET_MODE (op0);
15902
15903 if (!aarch64_sve_cmp_operand_p (code, op1))
15904 op1 = force_reg (data_mode, op1);
15905
15906 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15907 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15908 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15909 }
15910
15911 /* Emit the SVE equivalent of:
15912
15913 (set TMP1 (CODE1 OP0 OP1))
15914 (set TMP2 (CODE2 OP0 OP1))
15915 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15916
15917 PTRUE is an all-true predicate with the same mode as TARGET. */
15918
15919 static void
15920 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15921 rtx ptrue, rtx op0, rtx op1)
15922 {
15923 machine_mode pred_mode = GET_MODE (ptrue);
15924 rtx tmp1 = gen_reg_rtx (pred_mode);
15925 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15926 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15927 rtx tmp2 = gen_reg_rtx (pred_mode);
15928 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15929 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15930 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15931 }
15932
15933 /* Emit the SVE equivalent of:
15934
15935 (set TMP (CODE OP0 OP1))
15936 (set TARGET (not TMP))
15937
15938 PTRUE is an all-true predicate with the same mode as TARGET. */
15939
15940 static void
15941 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15942 rtx op0, rtx op1)
15943 {
15944 machine_mode pred_mode = GET_MODE (ptrue);
15945 rtx tmp = gen_reg_rtx (pred_mode);
15946 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15947 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15948 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15949 }
15950
15951 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15952
15953 (set TARGET (CODE OP0 OP1))
15954
15955 If CAN_INVERT_P is true, the caller can also handle inverted results;
15956 return true if the result is in fact inverted. */
15957
15958 bool
15959 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15960 rtx op0, rtx op1, bool can_invert_p)
15961 {
15962 machine_mode pred_mode = GET_MODE (target);
15963 machine_mode data_mode = GET_MODE (op0);
15964
15965 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15966 switch (code)
15967 {
15968 case UNORDERED:
15969 /* UNORDERED has no immediate form. */
15970 op1 = force_reg (data_mode, op1);
15971 /* fall through */
15972 case LT:
15973 case LE:
15974 case GT:
15975 case GE:
15976 case EQ:
15977 case NE:
15978 {
15979 /* There is native support for the comparison. */
15980 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15981 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15982 return false;
15983 }
15984
15985 case LTGT:
15986 /* This is a trapping operation (LT or GT). */
15987 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15988 return false;
15989
15990 case UNEQ:
15991 if (!flag_trapping_math)
15992 {
15993 /* This would trap for signaling NaNs. */
15994 op1 = force_reg (data_mode, op1);
15995 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15996 return false;
15997 }
15998 /* fall through */
15999 case UNLT:
16000 case UNLE:
16001 case UNGT:
16002 case UNGE:
16003 if (flag_trapping_math)
16004 {
16005 /* Work out which elements are ordered. */
16006 rtx ordered = gen_reg_rtx (pred_mode);
16007 op1 = force_reg (data_mode, op1);
16008 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16009
16010 /* Test the opposite condition for the ordered elements,
16011 then invert the result. */
16012 if (code == UNEQ)
16013 code = NE;
16014 else
16015 code = reverse_condition_maybe_unordered (code);
16016 if (can_invert_p)
16017 {
16018 aarch64_emit_sve_predicated_cond (target, code,
16019 ordered, op0, op1);
16020 return true;
16021 }
16022 rtx tmp = gen_reg_rtx (pred_mode);
16023 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16024 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16025 return false;
16026 }
16027 break;
16028
16029 case ORDERED:
16030 /* ORDERED has no immediate form. */
16031 op1 = force_reg (data_mode, op1);
16032 break;
16033
16034 default:
16035 gcc_unreachable ();
16036 }
16037
16038 /* There is native support for the inverse comparison. */
16039 code = reverse_condition_maybe_unordered (code);
16040 if (can_invert_p)
16041 {
16042 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16043 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16044 return true;
16045 }
16046 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16047 return false;
16048 }
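
/* For example, with -ftrapping-math an UNGE comparison is expanded above by
   first computing the predicate of ordered lanes (the inverse of UNORDERED)
   and then testing the reversed condition LT on just those lanes; the caller
   either accepts the inverted result or it is inverted here with NOT.  */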
16049
16050 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16051 of the data being selected and CMP_MODE is the mode of the values being
16052 compared. */
16053
16054 void
16055 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16056 rtx *ops)
16057 {
16058 machine_mode pred_mode
16059 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16060 GET_MODE_SIZE (cmp_mode)).require ();
16061 rtx pred = gen_reg_rtx (pred_mode);
16062 if (FLOAT_MODE_P (cmp_mode))
16063 {
16064 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16065 ops[4], ops[5], true))
16066 std::swap (ops[1], ops[2]);
16067 }
16068 else
16069 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16070
16071 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16072 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16073 }
16074
16075 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16076    true.  However, due to issues with register allocation it is preferable
16077    to avoid tying integer scalar and FP scalar modes.  Executing integer
16078 operations in general registers is better than treating them as scalar
16079 vector operations. This reduces latency and avoids redundant int<->FP
16080 moves. So tie modes if they are either the same class, or vector modes
16081 with other vector modes, vector structs or any scalar mode. */
16082
16083 static bool
16084 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16085 {
16086 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16087 return true;
16088
16089 /* We specifically want to allow elements of "structure" modes to
16090 be tieable to the structure. This more general condition allows
16091 other rarer situations too. The reason we don't extend this to
16092 predicate modes is that there are no predicate structure modes
16093 nor any specific instructions for extracting part of a predicate
16094 register. */
16095 if (aarch64_vector_data_mode_p (mode1)
16096 && aarch64_vector_data_mode_p (mode2))
16097 return true;
16098
16099 /* Also allow any scalar modes with vectors. */
16100 if (aarch64_vector_mode_supported_p (mode1)
16101 || aarch64_vector_mode_supported_p (mode2))
16102 return true;
16103
16104 return false;
16105 }
16106
16107 /* Return a new RTX holding the result of moving POINTER forward by
16108 AMOUNT bytes. */
16109
16110 static rtx
16111 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16112 {
16113 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16114
16115 return adjust_automodify_address (pointer, GET_MODE (pointer),
16116 next, amount);
16117 }
16118
16119 /* Return a new RTX holding the result of moving POINTER forward by the
16120 size of the mode it points to. */
16121
16122 static rtx
16123 aarch64_progress_pointer (rtx pointer)
16124 {
16125 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16126 }
16127
16128 /* Copy one MODE-sized block from SRC to DST, then progress SRC and DST by
16129 MODE bytes. */
16130
16131 static void
16132 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16133 machine_mode mode)
16134 {
16135 rtx reg = gen_reg_rtx (mode);
16136
16137 /* "Cast" the pointers to the correct mode. */
16138 *src = adjust_address (*src, mode, 0);
16139 *dst = adjust_address (*dst, mode, 0);
16140 /* Emit the memcpy. */
16141 emit_move_insn (reg, *src);
16142 emit_move_insn (*dst, reg);
16143 /* Move the pointers forward. */
16144 *src = aarch64_progress_pointer (*src);
16145 *dst = aarch64_progress_pointer (*dst);
16146 }
16147
16148 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16149 we succeed, otherwise return false. */
16150
16151 bool
16152 aarch64_expand_movmem (rtx *operands)
16153 {
16154 int n, mode_bits;
16155 rtx dst = operands[0];
16156 rtx src = operands[1];
16157 rtx base;
16158 machine_mode cur_mode = BLKmode, next_mode;
16159 bool speed_p = !optimize_function_for_size_p (cfun);
16160
16161   /* When optimizing for size, give a better estimate of the length of a
16162      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
16163      will always require an even number of instructions.  Each
16164      operation requires both a load and a store, so divide the max number by 2.  */
16165 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16166
16167 /* We can't do anything smart if the amount to copy is not constant. */
16168 if (!CONST_INT_P (operands[2]))
16169 return false;
16170
16171 n = INTVAL (operands[2]);
16172
16173 /* Try to keep the number of instructions low. For all cases we will do at
16174 most two moves for the residual amount, since we'll always overlap the
16175 remainder. */
16176 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16177 return false;
16178
16179 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16180 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16181
16182 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16183 src = adjust_automodify_address (src, VOIDmode, base, 0);
16184
16185 /* Convert n to bits to make the rest of the code simpler. */
16186 n = n * BITS_PER_UNIT;
16187
16188 while (n > 0)
16189 {
16190       /* Find the largest mode in which to do the copy without over-reading
16191 	 or over-writing.  */
16192 opt_scalar_int_mode mode_iter;
16193 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16194 if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
16195 cur_mode = mode_iter.require ();
16196
16197 gcc_assert (cur_mode != BLKmode);
16198
16199 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16200 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16201
16202 n -= mode_bits;
16203
16204       /* Do certain trailing copies as overlapping if it's going to be
16205 	 cheaper, i.e. fewer instructions to do so.  For instance, for a 15
16206 	 byte copy it's more efficient to do two overlapping 8 byte copies than
16207 	 8 + 6 + 1.  */
16208 next_mode = smallest_mode_for_size (n, MODE_INT);
16209 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16210 if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
16211 {
16212 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16213 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16214 n = n_bits;
16215 }
16216 }
16217
16218 return true;
16219 }
16220
16221 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16222 SImode stores. Handle the case when the constant has identical
16223 bottom and top halves. This is beneficial when the two stores can be
16224 merged into an STP and we avoid synthesising potentially expensive
16225 immediates twice. Return true if such a split is possible. */
16226
16227 bool
16228 aarch64_split_dimode_const_store (rtx dst, rtx src)
16229 {
16230 rtx lo = gen_lowpart (SImode, src);
16231 rtx hi = gen_highpart_mode (SImode, DImode, src);
16232
16233 bool size_p = optimize_function_for_size_p (cfun);
16234
16235 if (!rtx_equal_p (lo, hi))
16236 return false;
16237
16238 unsigned int orig_cost
16239 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16240 unsigned int lo_cost
16241 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16242
16243 /* We want to transform:
16244 MOV x1, 49370
16245 MOVK x1, 0x140, lsl 16
16246 MOVK x1, 0xc0da, lsl 32
16247 MOVK x1, 0x140, lsl 48
16248 STR x1, [x0]
16249 into:
16250 MOV w1, 49370
16251 MOVK w1, 0x140, lsl 16
16252 STP w1, w1, [x0]
16253 So we want to perform this only when we save two instructions
16254 or more. When optimizing for size, however, accept any code size
16255 savings we can. */
16256 if (size_p && orig_cost <= lo_cost)
16257 return false;
16258
16259 if (!size_p
16260 && (orig_cost <= lo_cost + 1))
16261 return false;
16262
16263 rtx mem_lo = adjust_address (dst, SImode, 0);
16264 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16265 return false;
16266
16267 rtx tmp_reg = gen_reg_rtx (SImode);
16268 aarch64_expand_mov_immediate (tmp_reg, lo);
16269 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16270 /* Don't emit an explicit store pair as this may not be always profitable.
16271 Let the sched-fusion logic decide whether to merge them. */
16272 emit_move_insn (mem_lo, tmp_reg);
16273 emit_move_insn (mem_hi, tmp_reg);
16274
16275 return true;
16276 }
16277
16278 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16279
16280 static unsigned HOST_WIDE_INT
16281 aarch64_asan_shadow_offset (void)
16282 {
16283 return (HOST_WIDE_INT_1 << 36);
16284 }
16285
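 /* Implement TARGET_GEN_CCMP_FIRST.  Emit the compare that starts a
    conditional-compare sequence for comparison CODE of TREEOP0 and TREEOP1,
    recording the preparation and compare instructions in PREP_SEQ and
    GEN_SEQ.  Return the comparison against the CC register, or NULL_RTX if
    the comparison cannot be handled.  */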
16286 static rtx
16287 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16288 int code, tree treeop0, tree treeop1)
16289 {
16290 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16291 rtx op0, op1;
16292 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16293 insn_code icode;
16294 struct expand_operand ops[4];
16295
16296 start_sequence ();
16297 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16298
16299 op_mode = GET_MODE (op0);
16300 if (op_mode == VOIDmode)
16301 op_mode = GET_MODE (op1);
16302
16303 switch (op_mode)
16304 {
16305 case E_QImode:
16306 case E_HImode:
16307 case E_SImode:
16308 cmp_mode = SImode;
16309 icode = CODE_FOR_cmpsi;
16310 break;
16311
16312 case E_DImode:
16313 cmp_mode = DImode;
16314 icode = CODE_FOR_cmpdi;
16315 break;
16316
16317 case E_SFmode:
16318 cmp_mode = SFmode;
16319 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16320 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16321 break;
16322
16323 case E_DFmode:
16324 cmp_mode = DFmode;
16325 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16326 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16327 break;
16328
16329 default:
16330 end_sequence ();
16331 return NULL_RTX;
16332 }
16333
16334 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16335 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16336 if (!op0 || !op1)
16337 {
16338 end_sequence ();
16339 return NULL_RTX;
16340 }
16341 *prep_seq = get_insns ();
16342 end_sequence ();
16343
16344 create_fixed_operand (&ops[0], op0);
16345 create_fixed_operand (&ops[1], op1);
16346
16347 start_sequence ();
16348 if (!maybe_expand_insn (icode, 2, ops))
16349 {
16350 end_sequence ();
16351 return NULL_RTX;
16352 }
16353 *gen_seq = get_insns ();
16354 end_sequence ();
16355
16356 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16357 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16358 }
16359
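 /* Implement TARGET_GEN_CCMP_NEXT.  Emit a conditional compare that
    continues the sequence started by PREV, combining it with comparison
    CMP_CODE of TREEOP0 and TREEOP1 under BIT_CODE (AND or IOR).  Return the
    new comparison against the CC register, or NULL_RTX if it cannot be
    handled.  */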
16360 static rtx
16361 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16362 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16363 {
16364 rtx op0, op1, target;
16365 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16366 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16367 insn_code icode;
16368 struct expand_operand ops[6];
16369 int aarch64_cond;
16370
16371 push_to_sequence (*prep_seq);
16372 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16373
16374 op_mode = GET_MODE (op0);
16375 if (op_mode == VOIDmode)
16376 op_mode = GET_MODE (op1);
16377
16378 switch (op_mode)
16379 {
16380 case E_QImode:
16381 case E_HImode:
16382 case E_SImode:
16383 cmp_mode = SImode;
16384 icode = CODE_FOR_ccmpsi;
16385 break;
16386
16387 case E_DImode:
16388 cmp_mode = DImode;
16389 icode = CODE_FOR_ccmpdi;
16390 break;
16391
16392 case E_SFmode:
16393 cmp_mode = SFmode;
16394 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16395 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16396 break;
16397
16398 case E_DFmode:
16399 cmp_mode = DFmode;
16400 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16401 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16402 break;
16403
16404 default:
16405 end_sequence ();
16406 return NULL_RTX;
16407 }
16408
16409 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16410 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16411 if (!op0 || !op1)
16412 {
16413 end_sequence ();
16414 return NULL_RTX;
16415 }
16416 *prep_seq = get_insns ();
16417 end_sequence ();
16418
16419 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16420 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16421
16422 if (bit_code != AND)
16423 {
16424 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16425 GET_MODE (XEXP (prev, 0))),
16426 VOIDmode, XEXP (prev, 0), const0_rtx);
16427 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16428 }
16429
16430 create_fixed_operand (&ops[0], XEXP (prev, 0));
16431 create_fixed_operand (&ops[1], target);
16432 create_fixed_operand (&ops[2], op0);
16433 create_fixed_operand (&ops[3], op1);
16434 create_fixed_operand (&ops[4], prev);
16435 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16436
16437 push_to_sequence (*gen_seq);
16438 if (!maybe_expand_insn (icode, 6, ops))
16439 {
16440 end_sequence ();
16441 return NULL_RTX;
16442 }
16443
16444 *gen_seq = get_insns ();
16445 end_sequence ();
16446
16447 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16448 }
16449
16450 #undef TARGET_GEN_CCMP_FIRST
16451 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16452
16453 #undef TARGET_GEN_CCMP_NEXT
16454 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16455
16456 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16457 instruction fusion of some sort. */
16458
16459 static bool
16460 aarch64_macro_fusion_p (void)
16461 {
16462 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16463 }
16464
16465
16466 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16467 should be kept together during scheduling. */
16468
16469 static bool
16470 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16471 {
16472 rtx set_dest;
16473 rtx prev_set = single_set (prev);
16474 rtx curr_set = single_set (curr);
16475 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16476 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16477
16478 if (!aarch64_macro_fusion_p ())
16479 return false;
16480
16481 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16482 {
16483 /* We are trying to match:
16484 prev (mov) == (set (reg r0) (const_int imm16))
16485 curr (movk) == (set (zero_extract (reg r0)
16486 (const_int 16)
16487 (const_int 16))
16488 (const_int imm16_1)) */
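      /* In assembly terms this is roughly (illustrative registers and
	 immediates):
	     mov  w0, #imm16
	     movk w0, #imm16_1, lsl 16  */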
16489
16490 set_dest = SET_DEST (curr_set);
16491
16492 if (GET_CODE (set_dest) == ZERO_EXTRACT
16493 && CONST_INT_P (SET_SRC (curr_set))
16494 && CONST_INT_P (SET_SRC (prev_set))
16495 && CONST_INT_P (XEXP (set_dest, 2))
16496 && INTVAL (XEXP (set_dest, 2)) == 16
16497 && REG_P (XEXP (set_dest, 0))
16498 && REG_P (SET_DEST (prev_set))
16499 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16500 {
16501 return true;
16502 }
16503 }
16504
16505 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16506 {
16507
16508 /* We're trying to match:
16509 prev (adrp) == (set (reg r1)
16510 (high (symbol_ref ("SYM"))))
16511 curr (add) == (set (reg r0)
16512 (lo_sum (reg r1)
16513 (symbol_ref ("SYM"))))
16514 Note that r0 need not necessarily be the same as r1, especially
16515 during pre-regalloc scheduling. */
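      /* In assembly terms, an illustrative pairing is:
	     adrp x1, SYM
	     add  x0, x1, :lo12:SYM  */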
16516
16517 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16518 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16519 {
16520 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16521 && REG_P (XEXP (SET_SRC (curr_set), 0))
16522 && REGNO (XEXP (SET_SRC (curr_set), 0))
16523 == REGNO (SET_DEST (prev_set))
16524 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16525 XEXP (SET_SRC (curr_set), 1)))
16526 return true;
16527 }
16528 }
16529
16530 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16531 {
16532
16533 /* We're trying to match:
16534 prev (movk) == (set (zero_extract (reg r0)
16535 (const_int 16)
16536 (const_int 32))
16537 (const_int imm16_1))
16538 curr (movk) == (set (zero_extract (reg r0)
16539 (const_int 16)
16540 (const_int 48))
16541 (const_int imm16_2)) */
16542
16543 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16544 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16545 && REG_P (XEXP (SET_DEST (prev_set), 0))
16546 && REG_P (XEXP (SET_DEST (curr_set), 0))
16547 && REGNO (XEXP (SET_DEST (prev_set), 0))
16548 == REGNO (XEXP (SET_DEST (curr_set), 0))
16549 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16550 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16551 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16552 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16553 && CONST_INT_P (SET_SRC (prev_set))
16554 && CONST_INT_P (SET_SRC (curr_set)))
16555 return true;
16556
16557 }
16558 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16559 {
16560 /* We're trying to match:
16561 prev (adrp) == (set (reg r0)
16562 (high (symbol_ref ("SYM"))))
16563 curr (ldr) == (set (reg r1)
16564 (mem (lo_sum (reg r0)
16565 (symbol_ref ("SYM")))))
16566 or
16567 curr (ldr) == (set (reg r1)
16568 (zero_extend (mem
16569 (lo_sum (reg r0)
16570 (symbol_ref ("SYM")))))) */
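      /* In assembly terms, an illustrative pairing is:
	     adrp x0, SYM
	     ldr  x1, [x0, :lo12:SYM]  */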
16571 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16572 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16573 {
16574 rtx curr_src = SET_SRC (curr_set);
16575
16576 if (GET_CODE (curr_src) == ZERO_EXTEND)
16577 curr_src = XEXP (curr_src, 0);
16578
16579 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16580 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16581 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16582 == REGNO (SET_DEST (prev_set))
16583 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16584 XEXP (SET_SRC (prev_set), 0)))
16585 return true;
16586 }
16587 }
16588
16589 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16590 && aarch_crypto_can_dual_issue (prev, curr))
16591 return true;
16592
16593 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16594 && any_condjump_p (curr))
16595 {
16596 enum attr_type prev_type = get_attr_type (prev);
16597
16598 unsigned int condreg1, condreg2;
16599 rtx cc_reg_1;
16600 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16601 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16602
16603 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16604 && prev
16605 && modified_in_p (cc_reg_1, prev))
16606 {
16607 /* FIXME: this misses some instructions that are considered simple
16608 arithmetic for ThunderX.  Simple shifts are missed here. */
16609 if (prev_type == TYPE_ALUS_SREG
16610 || prev_type == TYPE_ALUS_IMM
16611 || prev_type == TYPE_LOGICS_REG
16612 || prev_type == TYPE_LOGICS_IMM)
16613 return true;
16614 }
16615 }
16616
16617 if (prev_set
16618 && curr_set
16619 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16620 && any_condjump_p (curr))
16621 {
16622 /* We're trying to match:
16623 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16624 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16625 (const_int 0))
16626 (label_ref ("SYM"))
16627 (pc)) */
16628 if (SET_DEST (curr_set) == (pc_rtx)
16629 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16630 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16631 && REG_P (SET_DEST (prev_set))
16632 && REGNO (SET_DEST (prev_set))
16633 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16634 {
16635 /* Fuse ALU operations followed by conditional branch instruction. */
16636 switch (get_attr_type (prev))
16637 {
16638 case TYPE_ALU_IMM:
16639 case TYPE_ALU_SREG:
16640 case TYPE_ADC_REG:
16641 case TYPE_ADC_IMM:
16642 case TYPE_ADCS_REG:
16643 case TYPE_ADCS_IMM:
16644 case TYPE_LOGIC_REG:
16645 case TYPE_LOGIC_IMM:
16646 case TYPE_CSEL:
16647 case TYPE_ADR:
16648 case TYPE_MOV_IMM:
16649 case TYPE_SHIFT_REG:
16650 case TYPE_SHIFT_IMM:
16651 case TYPE_BFM:
16652 case TYPE_RBIT:
16653 case TYPE_REV:
16654 case TYPE_EXTEND:
16655 return true;
16656
16657 default:;
16658 }
16659 }
16660 }
16661
16662 return false;
16663 }
16664
16665 /* Return true iff the instruction fusion described by OP is enabled. */
16666
16667 bool
16668 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16669 {
16670 return (aarch64_tune_params.fusible_ops & op) != 0;
16671 }
16672
16673 /* If MEM is in the form of [base+offset], extract the two parts
16674 of address and set to BASE and OFFSET, otherwise return false
16675 after clearing BASE and OFFSET. */
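 /* For example, (mem (plus (reg x1) (const_int 16))) yields BASE x1 and
    OFFSET 16, while a plain (mem (reg x1)) yields BASE x1 and OFFSET 0.  */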
16676
16677 bool
16678 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16679 {
16680 rtx addr;
16681
16682 gcc_assert (MEM_P (mem));
16683
16684 addr = XEXP (mem, 0);
16685
16686 if (REG_P (addr))
16687 {
16688 *base = addr;
16689 *offset = const0_rtx;
16690 return true;
16691 }
16692
16693 if (GET_CODE (addr) == PLUS
16694 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16695 {
16696 *base = XEXP (addr, 0);
16697 *offset = XEXP (addr, 1);
16698 return true;
16699 }
16700
16701 *base = NULL_RTX;
16702 *offset = NULL_RTX;
16703
16704 return false;
16705 }
16706
16707 /* Types for scheduling fusion. */
16708 enum sched_fusion_type
16709 {
16710 SCHED_FUSION_NONE = 0,
16711 SCHED_FUSION_LD_SIGN_EXTEND,
16712 SCHED_FUSION_LD_ZERO_EXTEND,
16713 SCHED_FUSION_LD,
16714 SCHED_FUSION_ST,
16715 SCHED_FUSION_NUM
16716 };
16717
16718 /* If INSN is a load or store of address in the form of [base+offset],
16719 extract the two parts and set to BASE and OFFSET. Return scheduling
16720 fusion type this INSN is. */
16721
16722 static enum sched_fusion_type
16723 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16724 {
16725 rtx x, dest, src;
16726 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16727
16728 gcc_assert (INSN_P (insn));
16729 x = PATTERN (insn);
16730 if (GET_CODE (x) != SET)
16731 return SCHED_FUSION_NONE;
16732
16733 src = SET_SRC (x);
16734 dest = SET_DEST (x);
16735
16736 machine_mode dest_mode = GET_MODE (dest);
16737
16738 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16739 return SCHED_FUSION_NONE;
16740
16741 if (GET_CODE (src) == SIGN_EXTEND)
16742 {
16743 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16744 src = XEXP (src, 0);
16745 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16746 return SCHED_FUSION_NONE;
16747 }
16748 else if (GET_CODE (src) == ZERO_EXTEND)
16749 {
16750 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16751 src = XEXP (src, 0);
16752 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16753 return SCHED_FUSION_NONE;
16754 }
16755
16756 if (GET_CODE (src) == MEM && REG_P (dest))
16757 extract_base_offset_in_addr (src, base, offset);
16758 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16759 {
16760 fusion = SCHED_FUSION_ST;
16761 extract_base_offset_in_addr (dest, base, offset);
16762 }
16763 else
16764 return SCHED_FUSION_NONE;
16765
16766 if (*base == NULL_RTX || *offset == NULL_RTX)
16767 fusion = SCHED_FUSION_NONE;
16768
16769 return fusion;
16770 }
16771
16772 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16773
16774 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16775 and PRI are only calculated for these instructions.  For other instructions,
16776 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16777 types of instruction fusion can be added by returning different priorities.
16778
16779 It's important that irrelevant instructions get the largest FUSION_PRI. */
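/* For example (illustrative operands), two stores str w0, [x1, 8] and
   str w2, [x1, 16] receive the same FUSION_PRI (same fusion type and base
   register), while their PRI values differ so that the store with the
   smaller offset comes first.  */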
16780
16781 static void
16782 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16783 int *fusion_pri, int *pri)
16784 {
16785 int tmp, off_val;
16786 rtx base, offset;
16787 enum sched_fusion_type fusion;
16788
16789 gcc_assert (INSN_P (insn));
16790
16791 tmp = max_pri - 1;
16792 fusion = fusion_load_store (insn, &base, &offset);
16793 if (fusion == SCHED_FUSION_NONE)
16794 {
16795 *pri = tmp;
16796 *fusion_pri = tmp;
16797 return;
16798 }
16799
16800 /* Set FUSION_PRI according to fusion type and base register. */
16801 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16802
16803 /* Calculate PRI. */
16804 tmp /= 2;
16805
16806 /* INSN with smaller offset goes first. */
16807 off_val = (int)(INTVAL (offset));
16808 if (off_val >= 0)
16809 tmp -= (off_val & 0xfffff);
16810 else
16811 tmp += ((- off_val) & 0xfffff);
16812
16813 *pri = tmp;
16814 return;
16815 }
16816
16817 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16818 Adjust priority of sha1h instructions so they are scheduled before
16819 other SHA1 instructions. */
16820
16821 static int
16822 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16823 {
16824 rtx x = PATTERN (insn);
16825
16826 if (GET_CODE (x) == SET)
16827 {
16828 x = SET_SRC (x);
16829
16830 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16831 return priority + 10;
16832 }
16833
16834 return priority;
16835 }
16836
16837 /* Given OPERANDS of consecutive load/store, check if we can merge
16838 them into ldp/stp. LOAD is true if they are load instructions.
16839 MODE is the mode of memory operands. */
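/* For example (illustrative only), the consecutive loads
       ldr w0, [x2]
       ldr w1, [x2, 4]
   can be merged into
       ldp w0, w1, [x2]
   provided the checks below succeed.  */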
16840
16841 bool
16842 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16843 machine_mode mode)
16844 {
16845 HOST_WIDE_INT offval_1, offval_2, msize;
16846 enum reg_class rclass_1, rclass_2;
16847 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16848
16849 if (load)
16850 {
16851 mem_1 = operands[1];
16852 mem_2 = operands[3];
16853 reg_1 = operands[0];
16854 reg_2 = operands[2];
16855 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16856 if (REGNO (reg_1) == REGNO (reg_2))
16857 return false;
16858 }
16859 else
16860 {
16861 mem_1 = operands[0];
16862 mem_2 = operands[2];
16863 reg_1 = operands[1];
16864 reg_2 = operands[3];
16865 }
16866
16867 /* The mems cannot be volatile. */
16868 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16869 return false;
16870
16871 /* If we have SImode and slow unaligned ldp,
16872 check that the alignment is at least 8 bytes. */
16873 if (mode == SImode
16874 && (aarch64_tune_params.extra_tuning_flags
16875 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16876 && !optimize_size
16877 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16878 return false;
16879
16880 /* Check if the addresses are in the form of [base+offset]. */
16881 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16882 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16883 return false;
16884 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16885 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16886 return false;
16887
16888 /* Check if the bases are same. */
16889 if (!rtx_equal_p (base_1, base_2))
16890 return false;
16891
16892 /* The operands must be of the same size. */
16893 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16894 GET_MODE_SIZE (GET_MODE (mem_2))));
16895
16896 offval_1 = INTVAL (offset_1);
16897 offval_2 = INTVAL (offset_2);
16898 /* We should only be trying this for fixed-sized modes. There is no
16899 SVE LDP/STP instruction. */
16900 msize = GET_MODE_SIZE (mode).to_constant ();
16901 /* Check if the offsets are consecutive. */
16902 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16903 return false;
16904
16905 /* Check if the addresses are clobbered by load. */
16906 if (load)
16907 {
16908 if (reg_mentioned_p (reg_1, mem_1))
16909 return false;
16910
16911 /* In increasing order, the last load can clobber the address. */
16912 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16913 return false;
16914 }
16915
16916 /* One of the memory accesses must be a mempair operand.
16917 If it is not the first one, they need to be swapped by the
16918 peephole. */
16919 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16920 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16921 return false;
16922
16923 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16924 rclass_1 = FP_REGS;
16925 else
16926 rclass_1 = GENERAL_REGS;
16927
16928 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16929 rclass_2 = FP_REGS;
16930 else
16931 rclass_2 = GENERAL_REGS;
16932
16933 /* Check if the registers are of same class. */
16934 if (rclass_1 != rclass_2)
16935 return false;
16936
16937 return true;
16938 }
16939
16940 /* Given OPERANDS of consecutive load/store that can be merged,
16941 swap them if they are not in ascending order. */
16942 void
16943 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16944 {
16945 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16946 HOST_WIDE_INT offval_1, offval_2;
16947
16948 if (load)
16949 {
16950 mem_1 = operands[1];
16951 mem_2 = operands[3];
16952 }
16953 else
16954 {
16955 mem_1 = operands[0];
16956 mem_2 = operands[2];
16957 }
16958
16959 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16960 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16961
16962 offval_1 = INTVAL (offset_1);
16963 offval_2 = INTVAL (offset_2);
16964
16965 if (offval_1 > offval_2)
16966 {
16967 /* Irrespective of whether this is a load or a store,
16968 we do the same swap. */
16969 std::swap (operands[0], operands[2]);
16970 std::swap (operands[1], operands[3]);
16971 }
16972 }
16973
16974 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16975 comparison between the two. */
16976 int
16977 aarch64_host_wide_int_compare (const void *x, const void *y)
16978 {
16979 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16980 * ((const HOST_WIDE_INT *) y));
16981 }
16982
16983 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16984 other pointing to a REG rtx containing an offset, compare the offsets
16985 of the two pairs.
16986
16987 Return:
16988
16989 1 iff offset (X) > offset (Y)
16990 0 iff offset (X) == offset (Y)
16991 -1 iff offset (X) < offset (Y) */
16992 int
16993 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16994 {
16995 const rtx * operands_1 = (const rtx *) x;
16996 const rtx * operands_2 = (const rtx *) y;
16997 rtx mem_1, mem_2, base, offset_1, offset_2;
16998
16999 if (MEM_P (operands_1[0]))
17000 mem_1 = operands_1[0];
17001 else
17002 mem_1 = operands_1[1];
17003
17004 if (MEM_P (operands_2[0]))
17005 mem_2 = operands_2[0];
17006 else
17007 mem_2 = operands_2[1];
17008
17009 /* Extract the offsets. */
17010 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17011 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17012
17013 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17014
17015 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17016 }
17017
17018 /* Given OPERANDS of consecutive load/store, check if we can merge
17019 them into ldp/stp by adjusting the offset. LOAD is true if they
17020 are load instructions. MODE is the mode of memory operands.
17021
17022 Given below consecutive stores:
17023
17024 str w1, [xb, 0x100]
17025 str w1, [xb, 0x104]
17026 str w1, [xb, 0x108]
17027 str w1, [xb, 0x10c]
17028
17029 Though the offsets are out of the range supported by stp, we can
17030 still pair them after adjusting the offset, like:
17031
17032 add scratch, xb, 0x100
17033 stp w1, w1, [scratch]
17034 stp w1, w1, [scratch, 0x8]
17035
17036 The peephole patterns detecting this opportunity should guarantee
17037 the scratch register is available. */
17038
17039 bool
17040 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17041 scalar_mode mode)
17042 {
17043 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17044 HOST_WIDE_INT offvals[4], msize;
17045 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17046 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17047
17048 if (load)
17049 {
17050 reg_1 = operands[0];
17051 mem_1 = operands[1];
17052 reg_2 = operands[2];
17053 mem_2 = operands[3];
17054 reg_3 = operands[4];
17055 mem_3 = operands[5];
17056 reg_4 = operands[6];
17057 mem_4 = operands[7];
17058 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17059 && REG_P (reg_3) && REG_P (reg_4));
17060
17061 /* Do not attempt to merge the loads if the loads clobber each other. */
17062 for (int i = 0; i < 8; i += 2)
17063 for (int j = i + 2; j < 8; j += 2)
17064 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17065 return false;
17066 }
17067 else
17068 {
17069 mem_1 = operands[0];
17070 reg_1 = operands[1];
17071 mem_2 = operands[2];
17072 reg_2 = operands[3];
17073 mem_3 = operands[4];
17074 reg_3 = operands[5];
17075 mem_4 = operands[6];
17076 reg_4 = operands[7];
17077 }
17078 /* Skip if the memory operand is by itself valid for ldp/stp. */
17079 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17080 return false;
17081
17082 /* The mems cannot be volatile. */
17083 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17084 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17085 return false;
17086
17087 /* Check if the addresses are in the form of [base+offset]. */
17088 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17089 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17090 return false;
17091 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17092 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17093 return false;
17094 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17095 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17096 return false;
17097 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17098 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17099 return false;
17100
17101 /* Check if the bases are same. */
17102 if (!rtx_equal_p (base_1, base_2)
17103 || !rtx_equal_p (base_2, base_3)
17104 || !rtx_equal_p (base_3, base_4))
17105 return false;
17106
17107 offvals[0] = INTVAL (offset_1);
17108 offvals[1] = INTVAL (offset_2);
17109 offvals[2] = INTVAL (offset_3);
17110 offvals[3] = INTVAL (offset_4);
17111 msize = GET_MODE_SIZE (mode);
17112
17113 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17114 qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17115
17116 if (!(offvals[1] == offvals[0] + msize
17117 && offvals[3] == offvals[2] + msize))
17118 return false;
17119
17120 /* Check that offsets are within range of each other. The ldp/stp
17121 instructions have 7 bit immediate offsets, so use 0x80. */
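  /* For SImode, for example, msize is 4, so the two resulting pairs must
     start within 4 * 0x80 = 512 bytes of each other.  */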
17122 if (offvals[2] - offvals[0] >= msize * 0x80)
17123 return false;
17124
17125 /* The offsets must be aligned with respect to each other. */
17126 if (offvals[0] % msize != offvals[2] % msize)
17127 return false;
17128
17129 /* Check if the addresses are clobbered by load. */
17130 if (load && (reg_mentioned_p (reg_1, mem_1)
17131 || reg_mentioned_p (reg_2, mem_2)
17132 || reg_mentioned_p (reg_3, mem_3)
17133 || reg_mentioned_p (reg_4, mem_4)))
17134 return false;
17135
17136 /* If we have SImode and slow unaligned ldp,
17137 check that the alignment is at least 8 bytes. */
17138 if (mode == SImode
17139 && (aarch64_tune_params.extra_tuning_flags
17140 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17141 && !optimize_size
17142 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17143 return false;
17144
17145 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17146 rclass_1 = FP_REGS;
17147 else
17148 rclass_1 = GENERAL_REGS;
17149
17150 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17151 rclass_2 = FP_REGS;
17152 else
17153 rclass_2 = GENERAL_REGS;
17154
17155 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17156 rclass_3 = FP_REGS;
17157 else
17158 rclass_3 = GENERAL_REGS;
17159
17160 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17161 rclass_4 = FP_REGS;
17162 else
17163 rclass_4 = GENERAL_REGS;
17164
17165 /* Check if the registers are of same class. */
17166 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17167 return false;
17168
17169 return true;
17170 }
17171
17172 /* Given OPERANDS of consecutive load/store, this function pairs them
17173 into LDP/STP after adjusting the offset. It depends on the fact
17174 that the operands can be sorted so the offsets are correct for STP.
17175 MODE is the mode of memory operands. CODE is the rtl operator
17176 which should be applied to all memory operands: SIGN_EXTEND,
17177 ZERO_EXTEND or UNKNOWN. */
17178
17179 bool
17180 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17181 scalar_mode mode, RTX_CODE code)
17182 {
17183 rtx base, offset_1, offset_3, t1, t2;
17184 rtx mem_1, mem_2, mem_3, mem_4;
17185 rtx temp_operands[8];
17186 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17187 stp_off_upper_limit, stp_off_lower_limit, msize;
17188
17189 /* We make changes on a copy as we may still bail out. */
17190 for (int i = 0; i < 8; i++)
17191 temp_operands[i] = operands[i];
17192
17193 /* Sort the operands. */
17194 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17195
17196 if (load)
17197 {
17198 mem_1 = temp_operands[1];
17199 mem_2 = temp_operands[3];
17200 mem_3 = temp_operands[5];
17201 mem_4 = temp_operands[7];
17202 }
17203 else
17204 {
17205 mem_1 = temp_operands[0];
17206 mem_2 = temp_operands[2];
17207 mem_3 = temp_operands[4];
17208 mem_4 = temp_operands[6];
17209 gcc_assert (code == UNKNOWN);
17210 }
17211
17212 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17213 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17214 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17215 && offset_3 != NULL_RTX);
17216
17217 /* Adjust offset so it can fit in LDP/STP instruction. */
17218 msize = GET_MODE_SIZE (mode);
17219 stp_off_upper_limit = msize * (0x40 - 1);
17220 stp_off_lower_limit = - msize * 0x40;
17221
17222 off_val_1 = INTVAL (offset_1);
17223 off_val_3 = INTVAL (offset_3);
17224
17225 /* The base offset is optimally half way between the two STP/LDP offsets. */
17226 if (msize <= 4)
17227 base_off = (off_val_1 + off_val_3) / 2;
17228 else
17229 /* However, due to issues with negative LDP/STP offset generation for
17230 larger modes (DF, DI and vector modes), we must not use negative
17231 addresses smaller than 9 signed unadjusted bits can store.  This
17232 provides the most range in this case.
17233 base_off = off_val_1;
17234
17235 /* Adjust the base so that it is aligned with the addresses but still
17236 optimal. */
17237 if (base_off % msize != off_val_1 % msize)
17238 /* Fix the offset, bearing in mind we want to make it bigger not
17239 smaller. */
17240 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17241 else if (msize <= 4)
17242 /* The negative range of LDP/STP is one larger than the positive range. */
17243 base_off += msize;
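  /* Tracing the str example above (SImode, msize 4, pair offsets 0x100 and
     0x108): the midpoint is 0x104, which is already suitably aligned, so it
     is bumped by msize to 0x108, giving LDP/STP offsets of -8 and 0.  */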
17244
17245 /* Check if base offset is too big or too small. We can attempt to resolve
17246 this issue by setting it to the maximum value and seeing if the offsets
17247 still fit. */
17248 if (base_off >= 0x1000)
17249 {
17250 base_off = 0x1000 - 1;
17251 /* We must still make sure that the base offset is aligned with respect
17252 to the address.  But it may not be made any bigger. */
17253 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17254 }
17255
17256 /* Likewise for the case where the base is too small. */
17257 if (base_off <= -0x1000)
17258 {
17259 base_off = -0x1000 + 1;
17260 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17261 }
17262
17263 /* Offset of the first STP/LDP. */
17264 new_off_1 = off_val_1 - base_off;
17265
17266 /* Offset of the second STP/LDP. */
17267 new_off_3 = off_val_3 - base_off;
17268
17269 /* The offsets must be within the range of the LDP/STP instructions. */
17270 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17271 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17272 return false;
17273
17274 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17275 new_off_1), true);
17276 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17277 new_off_1 + msize), true);
17278 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17279 new_off_3), true);
17280 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17281 new_off_3 + msize), true);
17282
17283 if (!aarch64_mem_pair_operand (mem_1, mode)
17284 || !aarch64_mem_pair_operand (mem_3, mode))
17285 return false;
17286
17287 if (code == ZERO_EXTEND)
17288 {
17289 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17290 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17291 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17292 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17293 }
17294 else if (code == SIGN_EXTEND)
17295 {
17296 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17297 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17298 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17299 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17300 }
17301
17302 if (load)
17303 {
17304 operands[0] = temp_operands[0];
17305 operands[1] = mem_1;
17306 operands[2] = temp_operands[2];
17307 operands[3] = mem_2;
17308 operands[4] = temp_operands[4];
17309 operands[5] = mem_3;
17310 operands[6] = temp_operands[6];
17311 operands[7] = mem_4;
17312 }
17313 else
17314 {
17315 operands[0] = mem_1;
17316 operands[1] = temp_operands[1];
17317 operands[2] = mem_2;
17318 operands[3] = temp_operands[3];
17319 operands[4] = mem_3;
17320 operands[5] = temp_operands[5];
17321 operands[6] = mem_4;
17322 operands[7] = temp_operands[7];
17323 }
17324
17325 /* Emit adjusting instruction. */
17326 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17327 /* Emit ldp/stp instructions. */
17328 t1 = gen_rtx_SET (operands[0], operands[1]);
17329 t2 = gen_rtx_SET (operands[2], operands[3]);
17330 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17331 t1 = gen_rtx_SET (operands[4], operands[5]);
17332 t2 = gen_rtx_SET (operands[6], operands[7]);
17333 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17334 return true;
17335 }
17336
17337 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17338 it isn't worth branching around empty masked ops (including masked
17339 stores). */
17340
17341 static bool
17342 aarch64_empty_mask_is_expensive (unsigned)
17343 {
17344 return false;
17345 }
17346
17347 /* Return 1 if pseudo register should be created and used to hold
17348 GOT address for PIC code. */
17349
17350 bool
17351 aarch64_use_pseudo_pic_reg (void)
17352 {
17353 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17354 }
17355
17356 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17357
17358 static int
17359 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17360 {
17361 switch (XINT (x, 1))
17362 {
17363 case UNSPEC_GOTSMALLPIC:
17364 case UNSPEC_GOTSMALLPIC28K:
17365 case UNSPEC_GOTTINYPIC:
17366 return 0;
17367 default:
17368 break;
17369 }
17370
17371 return default_unspec_may_trap_p (x, flags);
17372 }
17373
17374
17375 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17376 return the log2 of that value. Otherwise return -1. */
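/* For example, 8.0 yields 3 and 1.0 yields 0, while -4.0, 0.5, 3.0, NaNs
   and infinities all yield -1.  */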
17377
17378 int
17379 aarch64_fpconst_pow_of_2 (rtx x)
17380 {
17381 const REAL_VALUE_TYPE *r;
17382
17383 if (!CONST_DOUBLE_P (x))
17384 return -1;
17385
17386 r = CONST_DOUBLE_REAL_VALUE (x);
17387
17388 if (REAL_VALUE_NEGATIVE (*r)
17389 || REAL_VALUE_ISNAN (*r)
17390 || REAL_VALUE_ISINF (*r)
17391 || !real_isinteger (r, DFmode))
17392 return -1;
17393
17394 return exact_log2 (real_to_integer (r));
17395 }
17396
17397 /* If X is a vector of equal CONST_DOUBLE values and that value is
17398 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17399
17400 int
17401 aarch64_vec_fpconst_pow_of_2 (rtx x)
17402 {
17403 int nelts;
17404 if (GET_CODE (x) != CONST_VECTOR
17405 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17406 return -1;
17407
17408 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17409 return -1;
17410
17411 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17412 if (firstval <= 0)
17413 return -1;
17414
17415 for (int i = 1; i < nelts; i++)
17416 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17417 return -1;
17418
17419 return firstval;
17420 }
17421
17422 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17423 to float.
17424
17425 __fp16 always promotes through this hook.
17426 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17427 through the generic excess precision logic rather than here. */
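/* For example, given __fp16 a, b, the sum a + b is evaluated as
   (float) a + (float) b.  */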
17428
17429 static tree
17430 aarch64_promoted_type (const_tree t)
17431 {
17432 if (SCALAR_FLOAT_TYPE_P (t)
17433 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17434 return float_type_node;
17435
17436 return NULL_TREE;
17437 }
17438
17439 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17440
17441 static bool
17442 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17443 optimization_type opt_type)
17444 {
17445 switch (op)
17446 {
17447 case rsqrt_optab:
17448 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17449
17450 default:
17451 return true;
17452 }
17453 }
17454
17455 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17456
17457 static unsigned int
17458 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17459 int *offset)
17460 {
17461 /* Polynomial invariant 1 == (VG / 2) - 1. */
17462 gcc_assert (i == 1);
17463 *factor = 2;
17464 *offset = 1;
17465 return AARCH64_DWARF_VG;
17466 }
17467
17468 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17469 if MODE is HFmode, and punt to the generic implementation otherwise. */
17470
17471 static bool
17472 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17473 {
17474 return (mode == HFmode
17475 ? true
17476 : default_libgcc_floating_mode_supported_p (mode));
17477 }
17478
17479 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17480 if MODE is HFmode, and punt to the generic implementation otherwise. */
17481
17482 static bool
17483 aarch64_scalar_mode_supported_p (scalar_mode mode)
17484 {
17485 return (mode == HFmode
17486 ? true
17487 : default_scalar_mode_supported_p (mode));
17488 }
17489
17490 /* Set the value of FLT_EVAL_METHOD.
17491 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17492
17493 0: evaluate all operations and constants, whose semantic type has at
17494 most the range and precision of type float, to the range and
17495 precision of float; evaluate all other operations and constants to
17496 the range and precision of the semantic type;
17497
17498 N, where _FloatN is a supported interchange floating type
17499 evaluate all operations and constants, whose semantic type has at
17500 most the range and precision of _FloatN type, to the range and
17501 precision of the _FloatN type; evaluate all other operations and
17502 constants to the range and precision of the semantic type;
17503
17504 If we have the ARMv8.2-A extensions then we support _Float16 in native
17505 precision, so we should set this to 16. Otherwise, we support the type,
17506 but want to evaluate expressions in float precision, so set this to
17507 0. */
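/* For example, with the ARMv8.2-A half-precision instructions available
   (e.g. via an +fp16 architecture extension) a product of two _Float16
   values can be evaluated directly in _Float16; without them it is
   evaluated in float and converted back.  */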
17508
17509 static enum flt_eval_method
17510 aarch64_excess_precision (enum excess_precision_type type)
17511 {
17512 switch (type)
17513 {
17514 case EXCESS_PRECISION_TYPE_FAST:
17515 case EXCESS_PRECISION_TYPE_STANDARD:
17516 /* We can calculate either in 16-bit range and precision or
17517 32-bit range and precision. Make that decision based on whether
17518 we have native support for the ARMv8.2-A 16-bit floating-point
17519 instructions or not. */
17520 return (TARGET_FP_F16INST
17521 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17522 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17523 case EXCESS_PRECISION_TYPE_IMPLICIT:
17524 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17525 default:
17526 gcc_unreachable ();
17527 }
17528 return FLT_EVAL_METHOD_UNPREDICTABLE;
17529 }
17530
17531 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17532 scheduled for speculative execution. Reject the long-running division
17533 and square-root instructions. */
17534
17535 static bool
17536 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17537 {
17538 switch (get_attr_type (insn))
17539 {
17540 case TYPE_SDIV:
17541 case TYPE_UDIV:
17542 case TYPE_FDIVS:
17543 case TYPE_FDIVD:
17544 case TYPE_FSQRTS:
17545 case TYPE_FSQRTD:
17546 case TYPE_NEON_FP_SQRT_S:
17547 case TYPE_NEON_FP_SQRT_D:
17548 case TYPE_NEON_FP_SQRT_S_Q:
17549 case TYPE_NEON_FP_SQRT_D_Q:
17550 case TYPE_NEON_FP_DIV_S:
17551 case TYPE_NEON_FP_DIV_D:
17552 case TYPE_NEON_FP_DIV_S_Q:
17553 case TYPE_NEON_FP_DIV_D_Q:
17554 return false;
17555 default:
17556 return true;
17557 }
17558 }
17559
17560 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17561
17562 static int
17563 aarch64_compute_pressure_classes (reg_class *classes)
17564 {
17565 int i = 0;
17566 classes[i++] = GENERAL_REGS;
17567 classes[i++] = FP_REGS;
17568 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17569 registers need to go in PR_LO_REGS at some point during their
17570 lifetime. Splitting it into two halves has the effect of making
17571 all predicates count against PR_LO_REGS, so that we try whenever
17572 possible to restrict the number of live predicates to 8. This
17573 greatly reduces the amount of spilling in certain loops. */
17574 classes[i++] = PR_LO_REGS;
17575 classes[i++] = PR_HI_REGS;
17576 return i;
17577 }
17578
17579 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17580
17581 static bool
17582 aarch64_can_change_mode_class (machine_mode from,
17583 machine_mode to, reg_class_t)
17584 {
17585 if (BYTES_BIG_ENDIAN)
17586 {
17587 bool from_sve_p = aarch64_sve_data_mode_p (from);
17588 bool to_sve_p = aarch64_sve_data_mode_p (to);
17589
17590 /* Don't allow changes between SVE data modes and non-SVE modes.
17591 See the comment at the head of aarch64-sve.md for details. */
17592 if (from_sve_p != to_sve_p)
17593 return false;
17594
17595 /* Don't allow changes in element size: lane 0 of the new vector
17596 would not then be lane 0 of the old vector. See the comment
17597 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17598 description.
17599
17600 In the worst case, this forces a register to be spilled in
17601 one mode and reloaded in the other, which handles the
17602 endianness correctly. */
17603 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17604 return false;
17605 }
17606 return true;
17607 }
17608
17609 /* Implement TARGET_EARLY_REMAT_MODES. */
17610
17611 static void
17612 aarch64_select_early_remat_modes (sbitmap modes)
17613 {
17614 /* SVE values are not normally live across a call, so it should be
17615 worth doing early rematerialization even in VL-specific mode. */
17616 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17617 {
17618 machine_mode mode = (machine_mode) i;
17619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17620 if (vec_flags & VEC_ANY_SVE)
17621 bitmap_set_bit (modes, i);
17622 }
17623 }
17624
17625 /* Target-specific selftests. */
17626
17627 #if CHECKING_P
17628
17629 namespace selftest {
17630
17631 /* Selftest for the RTL loader.
17632 Verify that the RTL loader copes with a dump from
17633 print_rtx_function. This is essentially just a test that class
17634 function_reader can handle a real dump, but it also verifies
17635 that lookup_reg_by_dump_name correctly handles hard regs.
17636 The presence of hard reg names in the dump means that the test is
17637 target-specific, hence it is in this file. */
17638
17639 static void
17640 aarch64_test_loading_full_dump ()
17641 {
17642 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17643
17644 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17645
17646 rtx_insn *insn_1 = get_insn_by_uid (1);
17647 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17648
17649 rtx_insn *insn_15 = get_insn_by_uid (15);
17650 ASSERT_EQ (INSN, GET_CODE (insn_15));
17651 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17652
17653 /* Verify crtl->return_rtx. */
17654 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17655 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17656 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17657 }
17658
17659 /* Run all target-specific selftests. */
17660
17661 static void
17662 aarch64_run_selftests (void)
17663 {
17664 aarch64_test_loading_full_dump ();
17665 }
17666
17667 } // namespace selftest
17668
17669 #endif /* #if CHECKING_P */
17670
17671 #undef TARGET_ADDRESS_COST
17672 #define TARGET_ADDRESS_COST aarch64_address_cost
17673
17674 /* This hook determines whether unnamed bitfields affect the alignment
17675 of the containing structure. The hook returns true if the structure
17676 should inherit the alignment requirements of an unnamed bitfield's
17677 type. */
17678 #undef TARGET_ALIGN_ANON_BITFIELD
17679 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17680
17681 #undef TARGET_ASM_ALIGNED_DI_OP
17682 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17683
17684 #undef TARGET_ASM_ALIGNED_HI_OP
17685 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17686
17687 #undef TARGET_ASM_ALIGNED_SI_OP
17688 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17689
17690 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17691 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17692 hook_bool_const_tree_hwi_hwi_const_tree_true
17693
17694 #undef TARGET_ASM_FILE_START
17695 #define TARGET_ASM_FILE_START aarch64_start_file
17696
17697 #undef TARGET_ASM_OUTPUT_MI_THUNK
17698 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17699
17700 #undef TARGET_ASM_SELECT_RTX_SECTION
17701 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17702
17703 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17704 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17705
17706 #undef TARGET_BUILD_BUILTIN_VA_LIST
17707 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17708
17709 #undef TARGET_CALLEE_COPIES
17710 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17711
17712 #undef TARGET_CAN_ELIMINATE
17713 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17714
17715 #undef TARGET_CAN_INLINE_P
17716 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17717
17718 #undef TARGET_CANNOT_FORCE_CONST_MEM
17719 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17720
17721 #undef TARGET_CASE_VALUES_THRESHOLD
17722 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17723
17724 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17725 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17726
17727 /* Only the least significant bit is used for initialization guard
17728 variables. */
17729 #undef TARGET_CXX_GUARD_MASK_BIT
17730 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17731
17732 #undef TARGET_C_MODE_FOR_SUFFIX
17733 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17734
17735 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17736 #undef TARGET_DEFAULT_TARGET_FLAGS
17737 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17738 #endif
17739
17740 #undef TARGET_CLASS_MAX_NREGS
17741 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17742
17743 #undef TARGET_BUILTIN_DECL
17744 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17745
17746 #undef TARGET_BUILTIN_RECIPROCAL
17747 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17748
17749 #undef TARGET_C_EXCESS_PRECISION
17750 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17751
17752 #undef TARGET_EXPAND_BUILTIN
17753 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17754
17755 #undef TARGET_EXPAND_BUILTIN_VA_START
17756 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17757
17758 #undef TARGET_FOLD_BUILTIN
17759 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17760
17761 #undef TARGET_FUNCTION_ARG
17762 #define TARGET_FUNCTION_ARG aarch64_function_arg
17763
17764 #undef TARGET_FUNCTION_ARG_ADVANCE
17765 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17766
17767 #undef TARGET_FUNCTION_ARG_BOUNDARY
17768 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17769
17770 #undef TARGET_FUNCTION_ARG_PADDING
17771 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17772
17773 #undef TARGET_GET_RAW_RESULT_MODE
17774 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17775 #undef TARGET_GET_RAW_ARG_MODE
17776 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17777
17778 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17779 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17780
17781 #undef TARGET_FUNCTION_VALUE
17782 #define TARGET_FUNCTION_VALUE aarch64_function_value
17783
17784 #undef TARGET_FUNCTION_VALUE_REGNO_P
17785 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17786
17787 #undef TARGET_GIMPLE_FOLD_BUILTIN
17788 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17789
17790 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17791 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17792
17793 #undef TARGET_INIT_BUILTINS
17794 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17795
17796 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17797 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17798 aarch64_ira_change_pseudo_allocno_class
17799
17800 #undef TARGET_LEGITIMATE_ADDRESS_P
17801 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17802
17803 #undef TARGET_LEGITIMATE_CONSTANT_P
17804 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17805
17806 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17807 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17808 aarch64_legitimize_address_displacement
17809
17810 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17811 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17812
17813 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17814 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17815 aarch64_libgcc_floating_mode_supported_p
17816
17817 #undef TARGET_MANGLE_TYPE
17818 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17819
17820 #undef TARGET_MEMORY_MOVE_COST
17821 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17822
17823 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17824 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17825
17826 #undef TARGET_MUST_PASS_IN_STACK
17827 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17828
17829 /* This target hook should return true if accesses to volatile bitfields
17830 should use the narrowest mode possible. It should return false if these
17831 accesses should use the bitfield container type. */
17832 #undef TARGET_NARROW_VOLATILE_BITFIELD
17833 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17834
17835 #undef TARGET_OPTION_OVERRIDE
17836 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17837
17838 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17839 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17840 aarch64_override_options_after_change
17841
17842 #undef TARGET_OPTION_SAVE
17843 #define TARGET_OPTION_SAVE aarch64_option_save
17844
17845 #undef TARGET_OPTION_RESTORE
17846 #define TARGET_OPTION_RESTORE aarch64_option_restore
17847
17848 #undef TARGET_OPTION_PRINT
17849 #define TARGET_OPTION_PRINT aarch64_option_print
17850
17851 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17852 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17853
17854 #undef TARGET_SET_CURRENT_FUNCTION
17855 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17856
17857 #undef TARGET_PASS_BY_REFERENCE
17858 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17859
17860 #undef TARGET_PREFERRED_RELOAD_CLASS
17861 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17862
17863 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17864 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17865
17866 #undef TARGET_PROMOTED_TYPE
17867 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17868
17869 #undef TARGET_SECONDARY_RELOAD
17870 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17871
17872 #undef TARGET_SHIFT_TRUNCATION_MASK
17873 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17874
17875 #undef TARGET_SETUP_INCOMING_VARARGS
17876 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17877
17878 #undef TARGET_STRUCT_VALUE_RTX
17879 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17880
17881 #undef TARGET_REGISTER_MOVE_COST
17882 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17883
17884 #undef TARGET_RETURN_IN_MEMORY
17885 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17886
17887 #undef TARGET_RETURN_IN_MSB
17888 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17889
17890 #undef TARGET_RTX_COSTS
17891 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17892
17893 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17894 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17895
17896 #undef TARGET_SCHED_ISSUE_RATE
17897 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17898
17899 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17900 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17901 aarch64_sched_first_cycle_multipass_dfa_lookahead
17902
17903 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17904 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17905 aarch64_first_cycle_multipass_dfa_lookahead_guard
17906
17907 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17908 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17909 aarch64_get_separate_components
17910
17911 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17912 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17913 aarch64_components_for_bb
17914
17915 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17916 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17917 aarch64_disqualify_components
17918
17919 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17920 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17921 aarch64_emit_prologue_components
17922
17923 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17924 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17925 aarch64_emit_epilogue_components
17926
17927 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17928 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17929 aarch64_set_handled_components
17930
17931 #undef TARGET_TRAMPOLINE_INIT
17932 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17933
17934 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17935 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17936
17937 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17938 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17939
17940 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17941 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17942 aarch64_builtin_support_vector_misalignment
17943
17944 #undef TARGET_ARRAY_MODE
17945 #define TARGET_ARRAY_MODE aarch64_array_mode
17946
17947 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17948 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17949
17950 #undef TARGET_VECTORIZE_ADD_STMT_COST
17951 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17952
17953 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17954 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17955 aarch64_builtin_vectorization_cost
17956
17957 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17958 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17959
17960 #undef TARGET_VECTORIZE_BUILTINS
17961 #define TARGET_VECTORIZE_BUILTINS
17962
17963 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17964 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17965 aarch64_builtin_vectorized_function
17966
17967 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17968 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17969 aarch64_autovectorize_vector_sizes
17970
17971 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17972 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17973 aarch64_atomic_assign_expand_fenv
17974
17975 /* Section anchor support. */
17976
#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

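/* Target-specific selftests are only available when GCC itself is built
   with internal consistency checking enabled.  */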
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

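/* Instantiate the target hook vector.  TARGET_INITIALIZER expands to an
   initializer built from the TARGET_* macros defined above, together with
   the defaults supplied by target-def.h, producing the targetm structure
   used by the rest of the compiler.  */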
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"