1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
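/* Illustrative note (not part of the original sources): for an Advanced
   SIMD constant such as a V4SI vector whose every element is 0x2500,
   the immediate would be described roughly as

     elt_mode = SImode, value = 0x25, insn = MOV, modifier = LSL, shift = 8

   i.e. "movi v0.4s, #0x25, lsl #8".  The field values here are an assumed
   example for explanation only, not taken from a particular call site.  */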
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 unsigned long aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Generic costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* Generic costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* Costs for vector insn classes for Vulcan. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params neoversen1_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
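/* Illustrative note (not part of the original sources): the table above is
   matched against the name=value pairs given to -moverride.  For example, a
   hypothetical command line of

     -moverride=tune=rename_fma_regs:sve_width=256

   would route "rename_fma_regs" to aarch64_parse_tune_string and "256" to
   aarch64_parse_sve_width_string.  The specific flag name is drawn from the
   .def files included earlier and is used here only as an example.  */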
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const unsigned long flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 /* The current tuning set. */
1176 struct tune_params aarch64_tune_params = generic_tunings;
1177
1178 /* Table of machine attributes. */
1179 static const struct attribute_spec aarch64_attribute_table[] =
1180 {
1181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1182 affects_type_identity, handler, exclude } */
1183 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1184 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1185 };
1186
1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1188
1189 /* An ISA extension in the co-processor and main instruction set space. */
1190 struct aarch64_option_extension
1191 {
1192 const char *const name;
1193 const unsigned long flags_on;
1194 const unsigned long flags_off;
1195 };
1196
1197 typedef enum aarch64_cond_code
1198 {
1199 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1200 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1201 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1202 }
1203 aarch64_cc;
1204
1205 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1206
1207 struct aarch64_branch_protect_type
1208 {
1209 /* The type's name that the user passes to the branch-protection option
1210 string. */
1211 const char* name;
1212 /* Function to handle the protection type and set global variables.
1213 First argument is the string token corresponding with this type and the
1214 second argument is the next token in the option string.
1215 Return values:
1216 * AARCH64_PARSE_OK: Handling was successful.
1217 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1218 should print an error.
1219 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1220 own error. */
1221 enum aarch64_parse_opt_result (*handler)(char*, char*);
1222 /* A list of types that can follow this type in the option string. */
1223 const aarch64_branch_protect_type* subtypes;
1224 unsigned int num_subtypes;
1225 };
1226
1227 static enum aarch64_parse_opt_result
1228 aarch64_handle_no_branch_protection (char* str, char* rest)
1229 {
1230 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1231 aarch64_enable_bti = 0;
1232 if (rest)
1233 {
1234 error ("unexpected %<%s%> after %<%s%>", rest, str);
1235 return AARCH64_PARSE_INVALID_FEATURE;
1236 }
1237 return AARCH64_PARSE_OK;
1238 }
1239
1240 static enum aarch64_parse_opt_result
1241 aarch64_handle_standard_branch_protection (char* str, char* rest)
1242 {
1243 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1244 aarch64_enable_bti = 1;
1245 if (rest)
1246 {
1247 error ("unexpected %<%s%> after %<%s%>", rest, str);
1248 return AARCH64_PARSE_INVALID_FEATURE;
1249 }
1250 return AARCH64_PARSE_OK;
1251 }
1252
1253 static enum aarch64_parse_opt_result
1254 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1255 char* rest ATTRIBUTE_UNUSED)
1256 {
1257 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1263 char* rest ATTRIBUTE_UNUSED)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1266 return AARCH64_PARSE_OK;
1267 }
1268
1269 static enum aarch64_parse_opt_result
1270 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1271 char* rest ATTRIBUTE_UNUSED)
1272 {
1273 aarch64_enable_bti = 1;
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1278 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1279 { NULL, NULL, NULL, 0 }
1280 };
1281
1282 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1283 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1284 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1285 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1286 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1287 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1288 { NULL, NULL, NULL, 0 }
1289 };
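/* Illustrative note (not part of the original sources): the tables above
   drive the parsing of -mbranch-protection.  For example, a hypothetical

     -mbranch-protection=pac-ret+leaf+bti

   first invokes aarch64_handle_pac_ret_protection for "pac-ret", then the
   "leaf" subtype handler aarch64_handle_pac_ret_leaf, and finally
   aarch64_handle_bti_protection, leaving aarch64_ra_sign_scope set to
   AARCH64_FUNCTION_ALL and aarch64_enable_bti set to 1.  */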
1290
1291 /* The condition codes of the processor, and the inverse function. */
1292 static const char * const aarch64_condition_codes[] =
1293 {
1294 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1295 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1296 };
1297
1298 /* Generate code to enable conditional branches in functions over 1 MiB. */
1299 const char *
1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1301 const char * branch_format)
1302 {
1303 rtx_code_label * tmp_label = gen_label_rtx ();
1304 char label_buf[256];
1305 char buffer[128];
1306 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1307 CODE_LABEL_NUMBER (tmp_label));
1308 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1309 rtx dest_label = operands[pos_label];
1310 operands[pos_label] = tmp_label;
1311
1312 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1313 output_asm_insn (buffer, operands);
1314
1315 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1316 operands[pos_label] = dest_label;
1317 output_asm_insn (buffer, operands);
1318 return "";
1319 }
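/* Illustrative note (not part of the original sources): for an out-of-range
   conditional branch the routine above emits a short-range branch (the
   caller normally supplies BRANCH_FORMAT with the condition inverted) over
   an unconditional branch to the real target, roughly

     <branch_format> .Lbf42
     b       <original target>
   .Lbf42:

   where .Lbf42 stands in for the internal label generated from DEST; the
   label name is only an assumed example.  */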
1320
1321 void
1322 aarch64_err_no_fpadvsimd (machine_mode mode)
1323 {
1324 if (TARGET_GENERAL_REGS_ONLY)
1325 if (FLOAT_MODE_P (mode))
1326 error ("%qs is incompatible with the use of floating-point types",
1327 "-mgeneral-regs-only");
1328 else
1329 error ("%qs is incompatible with the use of vector types",
1330 "-mgeneral-regs-only");
1331 else
1332 if (FLOAT_MODE_P (mode))
1333 error ("%qs feature modifier is incompatible with the use of"
1334 " floating-point types", "+nofp");
1335 else
1336 error ("%qs feature modifier is incompatible with the use of"
1337 " vector types", "+nofp");
1338 }
1339
1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1341 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1342 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1343 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1344 and GENERAL_REGS is lower than the memory cost (in this case the best class
1345 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1346 cost results in bad allocations with many redundant int<->FP moves which
1347 are expensive on various cores.
1348 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1349 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1350 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1351 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1352 The result of this is that it is no longer inefficient to have a higher
1353 memory move cost than the register move cost.
1354 */
1355
1356 static reg_class_t
1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1358 reg_class_t best_class)
1359 {
1360 machine_mode mode;
1361
1362 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1363 || !reg_class_subset_p (FP_REGS, allocno_class))
1364 return allocno_class;
1365
1366 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1367 || !reg_class_subset_p (FP_REGS, best_class))
1368 return best_class;
1369
1370 mode = PSEUDO_REGNO_MODE (regno);
1371 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1372 }
1373
1374 static unsigned int
1375 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1376 {
1377 if (GET_MODE_UNIT_SIZE (mode) == 4)
1378 return aarch64_tune_params.min_div_recip_mul_sf;
1379 return aarch64_tune_params.min_div_recip_mul_df;
1380 }
1381
1382 /* Return the reassociation width of treeop OPC with mode MODE. */
1383 static int
1384 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1385 {
1386 if (VECTOR_MODE_P (mode))
1387 return aarch64_tune_params.vec_reassoc_width;
1388 if (INTEGRAL_MODE_P (mode))
1389 return aarch64_tune_params.int_reassoc_width;
1390 /* Avoid reassociating floating point addition so we emit more FMAs. */
1391 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1392 return aarch64_tune_params.fp_reassoc_width;
1393 return 1;
1394 }
1395
1396 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1397 unsigned
1398 aarch64_dbx_register_number (unsigned regno)
1399 {
1400 if (GP_REGNUM_P (regno))
1401 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1402 else if (regno == SP_REGNUM)
1403 return AARCH64_DWARF_SP;
1404 else if (FP_REGNUM_P (regno))
1405 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1406 else if (PR_REGNUM_P (regno))
1407 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1408 else if (regno == VG_REGNUM)
1409 return AARCH64_DWARF_VG;
1410
1411 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1412 equivalent DWARF register. */
1413 return DWARF_FRAME_REGISTERS;
1414 }
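/* Illustrative note (not part of the original sources): with the DWARF
   numbering assumed here, x0 maps to 0, sp to 31, v0 to 64, p0 to 48 and
   the VG pseudo register to 46, following the AArch64 DWARF register
   mapping; anything else (for example the condition flags) gets
   DWARF_FRAME_REGISTERS, meaning "no DWARF equivalent".  */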
1415
1416 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1417 static bool
1418 aarch64_advsimd_struct_mode_p (machine_mode mode)
1419 {
1420 return (TARGET_SIMD
1421 && (mode == OImode || mode == CImode || mode == XImode));
1422 }
1423
1424 /* Return true if MODE is an SVE predicate mode. */
1425 static bool
1426 aarch64_sve_pred_mode_p (machine_mode mode)
1427 {
1428 return (TARGET_SVE
1429 && (mode == VNx16BImode
1430 || mode == VNx8BImode
1431 || mode == VNx4BImode
1432 || mode == VNx2BImode));
1433 }
1434
1435 /* Three mutually-exclusive flags describing a vector or predicate type. */
1436 const unsigned int VEC_ADVSIMD = 1;
1437 const unsigned int VEC_SVE_DATA = 2;
1438 const unsigned int VEC_SVE_PRED = 4;
1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1440 a structure of 2, 3 or 4 vectors. */
1441 const unsigned int VEC_STRUCT = 8;
1442 /* Useful combinations of the above. */
1443 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1445
1446 /* Return a set of flags describing the vector properties of mode MODE.
1447 Ignore modes that are not supported by the current target. */
1448 static unsigned int
1449 aarch64_classify_vector_mode (machine_mode mode)
1450 {
1451 if (aarch64_advsimd_struct_mode_p (mode))
1452 return VEC_ADVSIMD | VEC_STRUCT;
1453
1454 if (aarch64_sve_pred_mode_p (mode))
1455 return VEC_SVE_PRED;
1456
1457 scalar_mode inner = GET_MODE_INNER (mode);
1458 if (VECTOR_MODE_P (mode)
1459 && (inner == QImode
1460 || inner == HImode
1461 || inner == HFmode
1462 || inner == SImode
1463 || inner == SFmode
1464 || inner == DImode
1465 || inner == DFmode))
1466 {
1467 if (TARGET_SVE)
1468 {
1469 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1470 return VEC_SVE_DATA;
1471 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1472 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1473 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1474 return VEC_SVE_DATA | VEC_STRUCT;
1475 }
1476
1477 /* This includes V1DF but not V1DI (which doesn't exist). */
1478 if (TARGET_SIMD
1479 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1480 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1481 return VEC_ADVSIMD;
1482 }
1483
1484 return 0;
1485 }
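/* Illustrative note (not part of the original sources): with both SVE and
   Advanced SIMD enabled, the classification above gives, for example,

     V4SImode    -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     OImode      -> VEC_ADVSIMD | VEC_STRUCT   (pair of 128-bit vectors)
     VNx4SImode  -> VEC_SVE_DATA               (one SVE data vector)
     VNx16BImode -> VEC_SVE_PRED               (SVE predicate)

   and 0 for anything the current target does not support.  */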
1486
1487 /* Return true if MODE is any of the data vector modes, including
1488 structure modes. */
1489 static bool
1490 aarch64_vector_data_mode_p (machine_mode mode)
1491 {
1492 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1493 }
1494
1495 /* Return true if MODE is an SVE data vector mode; either a single vector
1496 or a structure of vectors. */
1497 static bool
1498 aarch64_sve_data_mode_p (machine_mode mode)
1499 {
1500 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1501 }
1502
1503 /* Implement target hook TARGET_ARRAY_MODE. */
1504 static opt_machine_mode
1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1506 {
1507 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1508 && IN_RANGE (nelems, 2, 4))
1509 return mode_for_vector (GET_MODE_INNER (mode),
1510 GET_MODE_NUNITS (mode) * nelems);
1511
1512 return opt_machine_mode ();
1513 }
1514
1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1516 static bool
1517 aarch64_array_mode_supported_p (machine_mode mode,
1518 unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (TARGET_SIMD
1521 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1522 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1523 && (nelems >= 2 && nelems <= 4))
1524 return true;
1525
1526 return false;
1527 }
1528
1529 /* Return the SVE predicate mode to use for elements that have
1530 ELEM_NBYTES bytes, if such a mode exists. */
1531
1532 opt_machine_mode
1533 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1534 {
1535 if (TARGET_SVE)
1536 {
1537 if (elem_nbytes == 1)
1538 return VNx16BImode;
1539 if (elem_nbytes == 2)
1540 return VNx8BImode;
1541 if (elem_nbytes == 4)
1542 return VNx4BImode;
1543 if (elem_nbytes == 8)
1544 return VNx2BImode;
1545 }
1546 return opt_machine_mode ();
1547 }
1548
1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1550
1551 static opt_machine_mode
1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1553 {
1554 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1555 {
1556 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1557 machine_mode pred_mode;
1558 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1559 return pred_mode;
1560 }
1561
1562 return default_get_mask_mode (nunits, nbytes);
1563 }
1564
1565 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1566 prefer to use the first arithmetic operand as the else value if
1567 the else value doesn't matter, since that exactly matches the SVE
1568 destructive merging form. For ternary operations we could either
1569 pick the first operand and use FMAD-like instructions or the last
1570 operand and use FMLA-like instructions; the latter seems more
1571 natural. */
1572
1573 static tree
1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1575 {
1576 return nops == 3 ? ops[2] : ops[0];
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_NREGS. */
1580
1581 static unsigned int
1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1583 {
1584 /* ??? Logically we should only need to provide a value when
1585 HARD_REGNO_MODE_OK says that the combination is valid,
1586 but at the moment we need to handle all modes. Just ignore
1587 any runtime parts for registers that can't store them. */
1588 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1589 switch (aarch64_regno_regclass (regno))
1590 {
1591 case FP_REGS:
1592 case FP_LO_REGS:
1593 if (aarch64_sve_data_mode_p (mode))
1594 return exact_div (GET_MODE_SIZE (mode),
1595 BYTES_PER_SVE_VECTOR).to_constant ();
1596 return CEIL (lowest_size, UNITS_PER_VREG);
1597 case PR_REGS:
1598 case PR_LO_REGS:
1599 case PR_HI_REGS:
1600 return 1;
1601 default:
1602 return CEIL (lowest_size, UNITS_PER_WORD);
1603 }
1604 gcc_unreachable ();
1605 }
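/* Illustrative note (not part of the original sources): for example, a
   V4SImode value occupies one FP/SIMD register, an OImode structure two,
   a TImode value two general registers, and any SVE predicate mode exactly
   one P register.  */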
1606
1607 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1608
1609 static bool
1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1611 {
1612 if (GET_MODE_CLASS (mode) == MODE_CC)
1613 return regno == CC_REGNUM;
1614
1615 if (regno == VG_REGNUM)
1616 /* This must have the same size as _Unwind_Word. */
1617 return mode == DImode;
1618
1619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1620 if (vec_flags & VEC_SVE_PRED)
1621 return PR_REGNUM_P (regno);
1622
1623 if (PR_REGNUM_P (regno))
1624 return 0;
1625
1626 if (regno == SP_REGNUM)
1627 /* The purpose of comparing with ptr_mode is to support the
1628 global register variable associated with the stack pointer
1629 register via the syntax of asm ("wsp") in ILP32. */
1630 return mode == Pmode || mode == ptr_mode;
1631
1632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1633 return mode == Pmode;
1634
1635 if (GP_REGNUM_P (regno))
1636 {
1637 if (known_le (GET_MODE_SIZE (mode), 8))
1638 return true;
1639 else if (known_le (GET_MODE_SIZE (mode), 16))
1640 return (regno & 1) == 0;
1641 }
1642 else if (FP_REGNUM_P (regno))
1643 {
1644 if (vec_flags & VEC_STRUCT)
1645 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1646 else
1647 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1648 }
1649
1650 return false;
1651 }
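/* Illustrative note (not part of the original sources): the checks above
   mean, for instance, that CCmode lives only in CC_REGNUM, SVE predicate
   modes live only in P registers, a TImode value in general registers must
   start at an even register number, and Advanced SIMD structure modes must
   not run past v31.  */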
1652
1653 /* Return true if this is a definition of a vectorized simd function. */
1654
1655 static bool
1656 aarch64_simd_decl_p (tree fndecl)
1657 {
1658 tree fntype;
1659
1660 if (fndecl == NULL)
1661 return false;
1662 fntype = TREE_TYPE (fndecl);
1663 if (fntype == NULL)
1664 return false;
1665
1666 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1667 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1668 return true;
1669
1670 return false;
1671 }
1672
1673 /* Return the mode a register save/restore should use. DImode for integer
1674 registers, DFmode for FP registers in non-SIMD functions (they only save
1675 the bottom half of a 128 bit register), or TFmode for FP registers in
1676 SIMD functions. */
1677
1678 static machine_mode
1679 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1680 {
1681 return GP_REGNUM_P (regno)
1682 ? E_DImode
1683 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1684 }
1685
1686 /* Return true if the instruction is a call to a SIMD function, false
1687 if it is not a SIMD function or if we do not know anything about
1688 the function. */
1689
1690 static bool
1691 aarch64_simd_call_p (rtx_insn *insn)
1692 {
1693 rtx symbol;
1694 rtx call;
1695 tree fndecl;
1696
1697 gcc_assert (CALL_P (insn));
1698 call = get_call_rtx_from (insn);
1699 symbol = XEXP (XEXP (call, 0), 0);
1700 if (GET_CODE (symbol) != SYMBOL_REF)
1701 return false;
1702 fndecl = SYMBOL_REF_DECL (symbol);
1703 if (!fndecl)
1704 return false;
1705
1706 return aarch64_simd_decl_p (fndecl);
1707 }
1708
1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1710 a function that uses the SIMD ABI, take advantage of the extra
1711 call-preserved registers that the ABI provides. */
1712
1713 void
1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1715 HARD_REG_SET *return_set)
1716 {
1717 if (aarch64_simd_call_p (insn))
1718 {
1719 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1720 if (FP_SIMD_SAVED_REGNUM_P (regno))
1721 CLEAR_HARD_REG_BIT (*return_set, regno);
1722 }
1723 }
1724
1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1726 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1727 clobbers the top 64 bits when restoring the bottom 64 bits. */
1728
1729 static bool
1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1731 machine_mode mode)
1732 {
1733 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1734 return FP_REGNUM_P (regno)
1735 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1736 }
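/* Illustrative note (not part of the original sources): for example, a
   V4SFmode value (16 bytes) held in v8 is partially clobbered by a call to
   an ordinary function, since only the low 64 bits of v8-v15 are preserved,
   but not by a call to an aarch64_vector_pcs function, which preserves the
   full 128 bits of that register (the SIMD_P case above).  */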
1737
1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1739
1740 rtx_insn *
1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1742 {
1743 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1744
1745 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1746 return call_1;
1747 else
1748 return call_2;
1749 }
1750
1751 /* Implement REGMODE_NATURAL_SIZE. */
1752 poly_uint64
1753 aarch64_regmode_natural_size (machine_mode mode)
1754 {
1755 /* The natural size for SVE data modes is one SVE data vector,
1756 and similarly for predicates. We can't independently modify
1757 anything smaller than that. */
1758 /* ??? For now, only do this for variable-width SVE registers.
1759 Doing it for constant-sized registers breaks lower-subreg.c. */
1760 /* ??? And once that's fixed, we should probably have similar
1761 code for Advanced SIMD. */
1762 if (!aarch64_sve_vg.is_constant ())
1763 {
1764 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1765 if (vec_flags & VEC_SVE_PRED)
1766 return BYTES_PER_SVE_PRED;
1767 if (vec_flags & VEC_SVE_DATA)
1768 return BYTES_PER_SVE_VECTOR;
1769 }
1770 return UNITS_PER_WORD;
1771 }
1772
1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1774 machine_mode
1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1776 machine_mode mode)
1777 {
1778 /* The predicate mode determines which bits are significant and
1779 which are "don't care". Decreasing the number of lanes would
1780 lose data while increasing the number of lanes would make bits
1781 unnecessarily significant. */
1782 if (PR_REGNUM_P (regno))
1783 return mode;
1784 if (known_ge (GET_MODE_SIZE (mode), 4))
1785 return mode;
1786 else
1787 return SImode;
1788 }
1789
1790 /* Return true if I's bits are consecutive ones from the MSB. */
1791 bool
1792 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1793 {
1794 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1795 }
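
/* For example, 0xffffffff00000000 qualifies: its negation is
   0x0000000100000000, a power of two, so exact_log2 returns 32 rather
   than -1.  0xff00000000000001 does not, since its negation is not a
   power of two.  (Worked example; a 64-bit HOST_WIDE_INT is assumed.)  */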
1796
1797 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1798 that strcpy from constants will be faster. */
1799
1800 static HOST_WIDE_INT
1801 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1802 {
1803 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1804 return MAX (align, BITS_PER_WORD);
1805 return align;
1806 }
1807
1808 /* Return true if calls to DECL should be treated as
1809 long-calls (i.e. called via a register). */
1810 static bool
1811 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1812 {
1813 return false;
1814 }
1815
1816 /* Return true if calls to symbol-ref SYM should be treated as
1817 long-calls (i.e. called via a register). */
1818 bool
1819 aarch64_is_long_call_p (rtx sym)
1820 {
1821 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1822 }
1823
1824 /* Return true if calls to symbol-ref SYM should not go through
1825 plt stubs. */
1826
1827 bool
1828 aarch64_is_noplt_call_p (rtx sym)
1829 {
1830 const_tree decl = SYMBOL_REF_DECL (sym);
1831
1832 if (flag_pic
1833 && decl
1834 && (!flag_plt
1835 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1836 && !targetm.binds_local_p (decl))
1837 return true;
1838
1839 return false;
1840 }
1841
1842 /* Return true if the offsets to a zero/sign-extract operation
1843 represent an expression that matches an extend operation. The
1844 operands represent the parameters from
1845
1846 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1847 bool
1848 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1849 rtx extract_imm)
1850 {
1851 HOST_WIDE_INT mult_val, extract_val;
1852
1853 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1854 return false;
1855
1856 mult_val = INTVAL (mult_imm);
1857 extract_val = INTVAL (extract_imm);
1858
1859 if (extract_val > 8
1860 && extract_val < GET_MODE_BITSIZE (mode)
1861 && exact_log2 (extract_val & ~7) > 0
1862 && (extract_val & 7) <= 4
1863 && mult_val == (1 << (extract_val & 7)))
1864 return true;
1865
1866 return false;
1867 }
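
/* As a worked example of the test above: with MODE == DImode,
   EXTRACT_IMM == 34 and MULT_IMM == 4 we return true, because
   34 & ~7 == 32 is a power of two, 34 & 7 == 2 is at most 4, and
   4 == 1 << 2.  This roughly corresponds to a 32-bit value that is
   extended and shifted left by 2, as in an extended-register operand
   such as "sxtw #2".  The same EXTRACT_IMM with MULT_IMM == 8 fails,
   since 8 != 1 << 2.  (Illustrative only.)  */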
1868
1869 /* Emit an insn that's a simple single-set. Both the operands must be
1870 known to be valid. */
1871 inline static rtx_insn *
1872 emit_set_insn (rtx x, rtx y)
1873 {
1874 return emit_insn (gen_rtx_SET (x, y));
1875 }
1876
1877 /* X and Y are two things to compare using CODE. Emit the compare insn and
1878 return the rtx for register 0 in the proper mode. */
1879 rtx
1880 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1881 {
1882 machine_mode mode = SELECT_CC_MODE (code, x, y);
1883 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1884
1885 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1886 return cc_reg;
1887 }
1888
1889 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1890
1891 static rtx
1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1893 machine_mode y_mode)
1894 {
1895 if (y_mode == E_QImode || y_mode == E_HImode)
1896 {
1897 if (CONST_INT_P (y))
1898 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1899 else
1900 {
1901 rtx t, cc_reg;
1902 machine_mode cc_mode;
1903
1904 t = gen_rtx_ZERO_EXTEND (SImode, y);
1905 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1906 cc_mode = CC_SWPmode;
1907 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1908 emit_set_insn (cc_reg, t);
1909 return cc_reg;
1910 }
1911 }
1912
1913 return aarch64_gen_compare_reg (code, x, y);
1914 }
1915
1916 /* Build the SYMBOL_REF for __tls_get_addr. */
1917
1918 static GTY(()) rtx tls_get_addr_libfunc;
1919
1920 rtx
1921 aarch64_tls_get_addr (void)
1922 {
1923 if (!tls_get_addr_libfunc)
1924 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1925 return tls_get_addr_libfunc;
1926 }
1927
1928 /* Return the TLS model to use for ADDR. */
1929
1930 static enum tls_model
1931 tls_symbolic_operand_type (rtx addr)
1932 {
1933 enum tls_model tls_kind = TLS_MODEL_NONE;
1934 if (GET_CODE (addr) == CONST)
1935 {
1936 poly_int64 addend;
1937 rtx sym = strip_offset (addr, &addend);
1938 if (GET_CODE (sym) == SYMBOL_REF)
1939 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1940 }
1941 else if (GET_CODE (addr) == SYMBOL_REF)
1942 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1943
1944 return tls_kind;
1945 }
1946
1947 /* We'll allow lo_sum's in addresses in our legitimate addresses
1948 so that combine can take care of combining addresses where
1949 necessary, but for generation purposes, we'll generate the address
1950 as:
1951 RTL Absolute
1952 tmp = hi (symbol_ref); adrp x1, foo
1953 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
1954 nop
1955
1956 PIC TLS
1957 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1958 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1959 bl __tls_get_addr
1960 nop
1961
1962 Load TLS symbol, depending on TLS mechanism and TLS access model.
1963
1964 Global Dynamic - Traditional TLS:
1965 adrp tmp, :tlsgd:imm
1966 add dest, tmp, #:tlsgd_lo12:imm
1967 bl __tls_get_addr
1968
1969 Global Dynamic - TLS Descriptors:
1970 adrp dest, :tlsdesc:imm
1971 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1972 add dest, dest, #:tlsdesc_lo12:imm
1973 blr tmp
1974 mrs tp, tpidr_el0
1975 add dest, dest, tp
1976
1977 Initial Exec:
1978 mrs tp, tpidr_el0
1979 adrp tmp, :gottprel:imm
1980 ldr dest, [tmp, #:gottprel_lo12:imm]
1981 add dest, dest, tp
1982
1983 Local Exec:
1984 mrs tp, tpidr_el0
1985 add t0, tp, #:tprel_hi12:imm, lsl #12
1986 add t0, t0, #:tprel_lo12_nc:imm
1987 */
1988
1989 static void
1990 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1991 enum aarch64_symbol_type type)
1992 {
1993 switch (type)
1994 {
1995 case SYMBOL_SMALL_ABSOLUTE:
1996 {
1997 /* In ILP32, the mode of dest can be either SImode or DImode. */
1998 rtx tmp_reg = dest;
1999 machine_mode mode = GET_MODE (dest);
2000
2001 gcc_assert (mode == Pmode || mode == ptr_mode);
2002
2003 if (can_create_pseudo_p ())
2004 tmp_reg = gen_reg_rtx (mode);
2005
2006 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2007 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2008 return;
2009 }
2010
2011 case SYMBOL_TINY_ABSOLUTE:
2012 emit_insn (gen_rtx_SET (dest, imm));
2013 return;
2014
2015 case SYMBOL_SMALL_GOT_28K:
2016 {
2017 machine_mode mode = GET_MODE (dest);
2018 rtx gp_rtx = pic_offset_table_rtx;
2019 rtx insn;
2020 rtx mem;
2021
2022 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2023 here before rtl expansion. Tree IVOPT will generate an rtl pattern
2024 to decide rtx costs, in which case pic_offset_table_rtx is not
2025 initialized. In that case there is no need to generate the first
2026 adrp instruction, as the final cost for global variable access is
2027 one instruction. */
2028 if (gp_rtx != NULL)
2029 {
2030 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2031 use the page base as the GOT base, the first page may be wasted;
2032 in the worst case there is only 28K of space for the GOT).
2033
2034 The generated instruction sequence for accessing a global variable
2035 is:
2036
2037 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2038
2039 Only one instruction is needed, but we must initialize
2040 pic_offset_table_rtx properly. We generate the initializing insn
2041 for every global access, and let CSE remove all the redundant ones.
2042
2043 The final instruction sequence will look like the following
2044 for multiple global variable accesses:
2045
2046 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2047
2048 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2049 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2051 ... */
2052
2053 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2054 crtl->uses_pic_offset_table = 1;
2055 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2056
2057 if (mode != GET_MODE (gp_rtx))
2058 gp_rtx = gen_lowpart (mode, gp_rtx);
2059
2060 }
2061
2062 if (mode == ptr_mode)
2063 {
2064 if (mode == DImode)
2065 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2066 else
2067 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2068
2069 mem = XVECEXP (SET_SRC (insn), 0, 0);
2070 }
2071 else
2072 {
2073 gcc_assert (mode == Pmode);
2074
2075 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2076 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2077 }
2078
2079 /* The operand is expected to be a MEM. Whenever the related insn
2080 pattern changes, the code above which calculates MEM should be
2081 updated. */
2082 gcc_assert (GET_CODE (mem) == MEM);
2083 MEM_READONLY_P (mem) = 1;
2084 MEM_NOTRAP_P (mem) = 1;
2085 emit_insn (insn);
2086 return;
2087 }
2088
2089 case SYMBOL_SMALL_GOT_4G:
2090 {
2091 /* In ILP32, the mode of dest can be either SImode or DImode,
2092 while the got entry is always of SImode size. The mode of
2093 dest depends on how dest is used: if dest is assigned to a
2094 pointer (e.g. in the memory), it has SImode; it may have
2095 DImode if dest is dereferenced to access the memory.
2096 This is why we have to handle three different ldr_got_small
2097 patterns here (two patterns for ILP32). */
2098
2099 rtx insn;
2100 rtx mem;
2101 rtx tmp_reg = dest;
2102 machine_mode mode = GET_MODE (dest);
2103
2104 if (can_create_pseudo_p ())
2105 tmp_reg = gen_reg_rtx (mode);
2106
2107 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2108 if (mode == ptr_mode)
2109 {
2110 if (mode == DImode)
2111 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2112 else
2113 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2114
2115 mem = XVECEXP (SET_SRC (insn), 0, 0);
2116 }
2117 else
2118 {
2119 gcc_assert (mode == Pmode);
2120
2121 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2122 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2123 }
2124
2125 gcc_assert (GET_CODE (mem) == MEM);
2126 MEM_READONLY_P (mem) = 1;
2127 MEM_NOTRAP_P (mem) = 1;
2128 emit_insn (insn);
2129 return;
2130 }
2131
2132 case SYMBOL_SMALL_TLSGD:
2133 {
2134 rtx_insn *insns;
2135 machine_mode mode = GET_MODE (dest);
2136 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2137
2138 start_sequence ();
2139 if (TARGET_ILP32)
2140 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2141 else
2142 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2143 insns = get_insns ();
2144 end_sequence ();
2145
2146 RTL_CONST_CALL_P (insns) = 1;
2147 emit_libcall_block (insns, dest, result, imm);
2148 return;
2149 }
2150
2151 case SYMBOL_SMALL_TLSDESC:
2152 {
2153 machine_mode mode = GET_MODE (dest);
2154 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2155 rtx tp;
2156
2157 gcc_assert (mode == Pmode || mode == ptr_mode);
2158
2159 /* In ILP32, the got entry is always of SImode size. Unlike
2160 small GOT, the dest is fixed at reg 0. */
2161 if (TARGET_ILP32)
2162 emit_insn (gen_tlsdesc_small_si (imm));
2163 else
2164 emit_insn (gen_tlsdesc_small_di (imm));
2165 tp = aarch64_load_tp (NULL);
2166
2167 if (mode != Pmode)
2168 tp = gen_lowpart (mode, tp);
2169
2170 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2171 if (REG_P (dest))
2172 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2173 return;
2174 }
2175
2176 case SYMBOL_SMALL_TLSIE:
2177 {
2178 /* In ILP32, the mode of dest can be either SImode or DImode,
2179 while the got entry is always of SImode size. The mode of
2180 dest depends on how dest is used: if dest is assigned to a
2181 pointer (e.g. in the memory), it has SImode; it may have
2182 DImode if dest is dereferenced to access the memory.
2183 This is why we have to handle three different tlsie_small
2184 patterns here (two patterns for ILP32). */
2185 machine_mode mode = GET_MODE (dest);
2186 rtx tmp_reg = gen_reg_rtx (mode);
2187 rtx tp = aarch64_load_tp (NULL);
2188
2189 if (mode == ptr_mode)
2190 {
2191 if (mode == DImode)
2192 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2193 else
2194 {
2195 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2196 tp = gen_lowpart (mode, tp);
2197 }
2198 }
2199 else
2200 {
2201 gcc_assert (mode == Pmode);
2202 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2203 }
2204
2205 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2206 if (REG_P (dest))
2207 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2208 return;
2209 }
2210
2211 case SYMBOL_TLSLE12:
2212 case SYMBOL_TLSLE24:
2213 case SYMBOL_TLSLE32:
2214 case SYMBOL_TLSLE48:
2215 {
2216 machine_mode mode = GET_MODE (dest);
2217 rtx tp = aarch64_load_tp (NULL);
2218
2219 if (mode != Pmode)
2220 tp = gen_lowpart (mode, tp);
2221
2222 switch (type)
2223 {
2224 case SYMBOL_TLSLE12:
2225 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2226 (dest, tp, imm));
2227 break;
2228 case SYMBOL_TLSLE24:
2229 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2230 (dest, tp, imm));
2231 break;
2232 case SYMBOL_TLSLE32:
2233 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2234 (dest, imm));
2235 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2236 (dest, dest, tp));
2237 break;
2238 case SYMBOL_TLSLE48:
2239 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2240 (dest, imm));
2241 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2242 (dest, dest, tp));
2243 break;
2244 default:
2245 gcc_unreachable ();
2246 }
2247
2248 if (REG_P (dest))
2249 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2250 return;
2251 }
2252
2253 case SYMBOL_TINY_GOT:
2254 emit_insn (gen_ldr_got_tiny (dest, imm));
2255 return;
2256
2257 case SYMBOL_TINY_TLSIE:
2258 {
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tp = aarch64_load_tp (NULL);
2261
2262 if (mode == ptr_mode)
2263 {
2264 if (mode == DImode)
2265 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2266 else
2267 {
2268 tp = gen_lowpart (mode, tp);
2269 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2270 }
2271 }
2272 else
2273 {
2274 gcc_assert (mode == Pmode);
2275 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2276 }
2277
2278 if (REG_P (dest))
2279 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2280 return;
2281 }
2282
2283 default:
2284 gcc_unreachable ();
2285 }
2286 }
2287
2288 /* Emit a move from SRC to DEST. Assume that the move expanders can
2289 handle all moves if !can_create_pseudo_p (). The distinction is
2290 important because, unlike emit_move_insn, the move expanders know
2291 how to force Pmode objects into the constant pool even when the
2292 constant pool address is not itself legitimate. */
2293 static rtx
2294 aarch64_emit_move (rtx dest, rtx src)
2295 {
2296 return (can_create_pseudo_p ()
2297 ? emit_move_insn (dest, src)
2298 : emit_move_insn_1 (dest, src));
2299 }
2300
2301 /* Apply UNOPTAB to OP and store the result in DEST. */
2302
2303 static void
2304 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2305 {
2306 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2307 if (dest != tmp)
2308 emit_move_insn (dest, tmp);
2309 }
2310
2311 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2312
2313 static void
2314 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2315 {
2316 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2317 OPTAB_DIRECT);
2318 if (dest != tmp)
2319 emit_move_insn (dest, tmp);
2320 }
2321
2322 /* Split a 128-bit move operation into two 64-bit move operations,
2323 taking care to handle partial overlap of register to register
2324 copies. Special cases are needed when moving between GP regs and
2325 FP regs. SRC can be a register, constant or memory; DST a register
2326 or memory. If either operand is memory it must not have any side
2327 effects. */
2328 void
2329 aarch64_split_128bit_move (rtx dst, rtx src)
2330 {
2331 rtx dst_lo, dst_hi;
2332 rtx src_lo, src_hi;
2333
2334 machine_mode mode = GET_MODE (dst);
2335
2336 gcc_assert (mode == TImode || mode == TFmode);
2337 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2338 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2339
2340 if (REG_P (dst) && REG_P (src))
2341 {
2342 int src_regno = REGNO (src);
2343 int dst_regno = REGNO (dst);
2344
2345 /* Handle FP <-> GP regs. */
2346 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2347 {
2348 src_lo = gen_lowpart (word_mode, src);
2349 src_hi = gen_highpart (word_mode, src);
2350
2351 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2352 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2353 return;
2354 }
2355 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2356 {
2357 dst_lo = gen_lowpart (word_mode, dst);
2358 dst_hi = gen_highpart (word_mode, dst);
2359
2360 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2361 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2362 return;
2363 }
2364 }
2365
2366 dst_lo = gen_lowpart (word_mode, dst);
2367 dst_hi = gen_highpart (word_mode, dst);
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart_mode (word_mode, mode, src);
2370
2371 /* At most one pairing may overlap. */
2372 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2373 {
2374 aarch64_emit_move (dst_hi, src_hi);
2375 aarch64_emit_move (dst_lo, src_lo);
2376 }
2377 else
2378 {
2379 aarch64_emit_move (dst_lo, src_lo);
2380 aarch64_emit_move (dst_hi, src_hi);
2381 }
2382 }
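
/* A short example of the overlap handling above, assuming the usual
   little-endian layout in which the low half of a TImode pair lives
   in the lower-numbered register:

     (x1,x2) <- (x2,x3): dst_lo x1 does not overlap src_hi x3, so the
       low half is copied first (x1 = x2, then x2 = x3);
     (x2,x3) <- (x1,x2): dst_lo x2 overlaps src_hi x2, so the high
       half is copied first (x3 = x2, then x2 = x1).

   Copying in the other order in either case would clobber a source
   half before it had been read.  (Illustrative only.)  */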
2383
2384 bool
2385 aarch64_split_128bit_move_p (rtx dst, rtx src)
2386 {
2387 return (! REG_P (src)
2388 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2389 }
2390
2391 /* Split a complex SIMD combine. */
2392
2393 void
2394 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2395 {
2396 machine_mode src_mode = GET_MODE (src1);
2397 machine_mode dst_mode = GET_MODE (dst);
2398
2399 gcc_assert (VECTOR_MODE_P (dst_mode));
2400 gcc_assert (register_operand (dst, dst_mode)
2401 && register_operand (src1, src_mode)
2402 && register_operand (src2, src_mode));
2403
2404 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2405 return;
2406 }
2407
2408 /* Split a complex SIMD move. */
2409
2410 void
2411 aarch64_split_simd_move (rtx dst, rtx src)
2412 {
2413 machine_mode src_mode = GET_MODE (src);
2414 machine_mode dst_mode = GET_MODE (dst);
2415
2416 gcc_assert (VECTOR_MODE_P (dst_mode));
2417
2418 if (REG_P (dst) && REG_P (src))
2419 {
2420 gcc_assert (VECTOR_MODE_P (src_mode));
2421 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2422 }
2423 }
2424
2425 bool
2426 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2427 machine_mode ymode, rtx y)
2428 {
2429 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2430 gcc_assert (r != NULL);
2431 return rtx_equal_p (x, r);
2432 }
2433
2434
2435 static rtx
2436 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2437 {
2438 if (can_create_pseudo_p ())
2439 return force_reg (mode, value);
2440 else
2441 {
2442 gcc_assert (x);
2443 aarch64_emit_move (x, value);
2444 return x;
2445 }
2446 }
2447
2448 /* Return true if we can move VALUE into a register using a single
2449 CNT[BHWD] instruction. */
2450
2451 static bool
2452 aarch64_sve_cnt_immediate_p (poly_int64 value)
2453 {
2454 HOST_WIDE_INT factor = value.coeffs[0];
2455 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2456 return (value.coeffs[1] == factor
2457 && IN_RANGE (factor, 2, 16 * 16)
2458 && (factor & 1) == 0
2459 && factor <= 16 * (factor & -factor));
2460 }
2461
2462 /* Likewise for rtx X. */
2463
2464 bool
2465 aarch64_sve_cnt_immediate_p (rtx x)
2466 {
2467 poly_int64 value;
2468 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2469 }
2470
2471 /* Return the asm string for an instruction with a CNT-like vector size
2472 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2473 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2474 first part of the operands template (the part that comes before the
2475 vector size itself). FACTOR is the number of quadwords.
2476 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2477 If it is zero, we can use any element size. */
2478
2479 static char *
2480 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2481 unsigned int factor,
2482 unsigned int nelts_per_vq)
2483 {
2484 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2485
2486 if (nelts_per_vq == 0)
2487 /* There is some overlap in the ranges of the four CNT instructions.
2488 Here we always use the smallest possible element size, so that the
2489 multiplier is 1 wherever possible. */
2490 nelts_per_vq = factor & -factor;
2491 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2492 gcc_assert (IN_RANGE (shift, 1, 4));
2493 char suffix = "dwhb"[shift - 1];
2494
2495 factor >>= shift;
2496 unsigned int written;
2497 if (factor == 1)
2498 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2499 prefix, suffix, operands);
2500 else
2501 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2502 prefix, suffix, operands, factor);
2503 gcc_assert (written < sizeof (buffer));
2504 return buffer;
2505 }
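
/* For instance, the VL-dependent value 10 * VQ (poly_int64 (10, 10))
   passes aarch64_sve_cnt_immediate_p: the factor 10 is even and no
   more than 16 times its lowest set bit.  With no fixed element size
   the routine above picks the D form, since 10 & -10 == 2, and prints
   something like "cntd x0, all, mul #5".  (Worked example; the exact
   operand text depends on the OPERANDS template.)  */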
2506
2507 /* Return the asm string for an instruction with a CNT-like vector size
2508 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2509 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2510 first part of the operands template (the part that comes before the
2511 vector size itself). X is the value of the vector size operand,
2512 as a polynomial integer rtx. */
2513
2514 char *
2515 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2516 rtx x)
2517 {
2518 poly_int64 value = rtx_to_poly_int64 (x);
2519 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2520 return aarch64_output_sve_cnt_immediate (prefix, operands,
2521 value.coeffs[1], 0);
2522 }
2523
2524 /* Return true if we can add VALUE to a register using a single ADDVL
2525 or ADDPL instruction. */
2526
2527 static bool
2528 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2529 {
2530 HOST_WIDE_INT factor = value.coeffs[0];
2531 if (factor == 0 || value.coeffs[1] != factor)
2532 return false;
2533 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2534 and a value of 16 is one vector width. */
2535 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2536 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2537 }
2538
2539 /* Likewise for rtx X. */
2540
2541 bool
2542 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2543 {
2544 poly_int64 value;
2545 return (poly_int_rtx_p (x, &value)
2546 && aarch64_sve_addvl_addpl_immediate_p (value));
2547 }
2548
2549 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2550 and storing the result in operand 0. */
2551
2552 char *
2553 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2554 {
2555 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2556 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2557 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2558
2559 /* Use INC or DEC if possible. */
2560 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2561 {
2562 if (aarch64_sve_cnt_immediate_p (offset_value))
2563 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2564 offset_value.coeffs[1], 0);
2565 if (aarch64_sve_cnt_immediate_p (-offset_value))
2566 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2567 -offset_value.coeffs[1], 0);
2568 }
2569
2570 int factor = offset_value.coeffs[1];
2571 if ((factor & 15) == 0)
2572 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2573 else
2574 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2575 return buffer;
2576 }
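
/* Two worked examples of the factor handling above: an offset of
   32 * VQ bytes (factor 32) is a whole number of vector lengths and
   prints as roughly "addvl x0, x1, #2", while an offset of 6 * VQ
   bytes (factor 6) is expressed in predicate widths and prints as
   roughly "addpl x0, x1, #3".  (Illustrative; the register names come
   from the %x0/%x1 template above.)  */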
2577
2578 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2579 instruction. If it is, store the number of elements in each vector
2580 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2581 factor in *FACTOR_OUT (if nonnull). */
2582
2583 bool
2584 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2585 unsigned int *nelts_per_vq_out)
2586 {
2587 rtx elt;
2588 poly_int64 value;
2589
2590 if (!const_vec_duplicate_p (x, &elt)
2591 || !poly_int_rtx_p (elt, &value))
2592 return false;
2593
2594 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2595 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2596 /* There's no vector INCB. */
2597 return false;
2598
2599 HOST_WIDE_INT factor = value.coeffs[0];
2600 if (value.coeffs[1] != factor)
2601 return false;
2602
2603 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2604 if ((factor % nelts_per_vq) != 0
2605 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2606 return false;
2607
2608 if (factor_out)
2609 *factor_out = factor;
2610 if (nelts_per_vq_out)
2611 *nelts_per_vq_out = nelts_per_vq;
2612 return true;
2613 }
2614
2615 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2616 instruction. */
2617
2618 bool
2619 aarch64_sve_inc_dec_immediate_p (rtx x)
2620 {
2621 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2622 }
2623
2624 /* Return the asm template for an SVE vector INC or DEC instruction.
2625 OPERANDS gives the operands before the vector count and X is the
2626 value of the vector count operand itself. */
2627
2628 char *
2629 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2630 {
2631 int factor;
2632 unsigned int nelts_per_vq;
2633 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2634 gcc_unreachable ();
2635 if (factor < 0)
2636 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2637 nelts_per_vq);
2638 else
2639 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2640 nelts_per_vq);
2641 }
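
/* For example, a VNx8HI duplicate of the value 8 * VQ is accepted by
   aarch64_sve_inc_dec_immediate_p: NELTS_PER_VQ is 8 and the factor 8
   lies in [8, 128], so the routine above emits the H form with
   multiplier 1, i.e. an "inch" template.  A factor of -16 instead
   yields "dech ..., all, mul #2".  (Worked example; the operand text
   depends on the OPERANDS argument.)  */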
2642
2643 static int
2644 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2645 scalar_int_mode mode)
2646 {
2647 int i;
2648 unsigned HOST_WIDE_INT val, val2, mask;
2649 int one_match, zero_match;
2650 int num_insns;
2651
2652 val = INTVAL (imm);
2653
2654 if (aarch64_move_imm (val, mode))
2655 {
2656 if (generate)
2657 emit_insn (gen_rtx_SET (dest, imm));
2658 return 1;
2659 }
2660
2661 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2662 (with XXXX non-zero). In that case check to see if the move can be done in
2663 a smaller mode. */
2664 val2 = val & 0xffffffff;
2665 if (mode == DImode
2666 && aarch64_move_imm (val2, SImode)
2667 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2671
2672 /* Check if we have to emit a second instruction by checking to see
2673 if any of the upper 32 bits of the original DI mode value is set. */
2674 if (val == val2)
2675 return 1;
2676
2677 i = (val >> 48) ? 48 : 32;
2678
2679 if (generate)
2680 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2681 GEN_INT ((val >> i) & 0xffff)));
2682
2683 return 2;
2684 }
2685
2686 if ((val >> 32) == 0 || mode == SImode)
2687 {
2688 if (generate)
2689 {
2690 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2691 if (mode == SImode)
2692 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2693 GEN_INT ((val >> 16) & 0xffff)));
2694 else
2695 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2696 GEN_INT ((val >> 16) & 0xffff)));
2697 }
2698 return 2;
2699 }
2700
2701 /* Remaining cases are all for DImode. */
2702
2703 mask = 0xffff;
2704 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2705 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2706 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2707 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2708
2709 if (zero_match != 2 && one_match != 2)
2710 {
2711 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2712 For a 64-bit bitmask try whether changing 16 bits to all ones or
2713 zeroes creates a valid bitmask. To check any repeated bitmask,
2714 try using 16 bits from the other 32-bit half of val. */
2715
2716 for (i = 0; i < 64; i += 16, mask <<= 16)
2717 {
2718 val2 = val & ~mask;
2719 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2720 break;
2721 val2 = val | mask;
2722 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2723 break;
2724 val2 = val2 & ~mask;
2725 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2726 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2727 break;
2728 }
2729 if (i != 64)
2730 {
2731 if (generate)
2732 {
2733 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2734 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2735 GEN_INT ((val >> i) & 0xffff)));
2736 }
2737 return 2;
2738 }
2739 }
2740
2741 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2742 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2743 otherwise skip zero bits. */
2744
2745 num_insns = 1;
2746 mask = 0xffff;
2747 val2 = one_match > zero_match ? ~val : val;
2748 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2749
2750 if (generate)
2751 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2752 ? (val | ~(mask << i))
2753 : (val & (mask << i)))));
2754 for (i += 16; i < 64; i += 16)
2755 {
2756 if ((val2 & (mask << i)) == 0)
2757 continue;
2758 if (generate)
2759 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2760 GEN_INT ((val >> i) & 0xffff)));
2761 num_insns ++;
2762 }
2763
2764 return num_insns;
2765 }
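
/* As a small worked example: for the DImode constant
   0x1234000000005678, the low 32 bits are a valid single MOV and the
   only other nonzero 16-bit chunk sits in bits 48-63, so the code
   above emits roughly

     mov  x0, 0x5678
     movk x0, 0x1234, lsl 48

   and returns 2.  (Illustrative; the exact assembly text comes from
   the mov/insv insn patterns.)  */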
2766
2767 /* Return whether imm is a 128-bit immediate which is simple enough to
2768 expand inline. */
2769 bool
2770 aarch64_mov128_immediate (rtx imm)
2771 {
2772 if (GET_CODE (imm) == CONST_INT)
2773 return true;
2774
2775 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2776
2777 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2778 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2779
2780 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2781 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2782 }
2783
2784
2785 /* Return the number of temporary registers that aarch64_add_offset_1
2786 would need to add OFFSET to a register. */
2787
2788 static unsigned int
2789 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2790 {
2791 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2792 }
2793
2794 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2795 a non-polynomial OFFSET. MODE is the mode of the addition.
2796 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2797 be set and CFA adjustments added to the generated instructions.
2798
2799 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2800 temporary if register allocation is already complete. This temporary
2801 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2802 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2803 the immediate again.
2804
2805 Since this function may be used to adjust the stack pointer, we must
2806 ensure that it cannot cause transient stack deallocation (for example
2807 by first incrementing SP and then decrementing when adjusting by a
2808 large immediate). */
2809
2810 static void
2811 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2812 rtx src, HOST_WIDE_INT offset, rtx temp1,
2813 bool frame_related_p, bool emit_move_imm)
2814 {
2815 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2816 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2817
2818 HOST_WIDE_INT moffset = abs_hwi (offset);
2819 rtx_insn *insn;
2820
2821 if (!moffset)
2822 {
2823 if (!rtx_equal_p (dest, src))
2824 {
2825 insn = emit_insn (gen_rtx_SET (dest, src));
2826 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2827 }
2828 return;
2829 }
2830
2831 /* Single instruction adjustment. */
2832 if (aarch64_uimm12_shift (moffset))
2833 {
2834 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2835 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2836 return;
2837 }
2838
2839 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2840 and either:
2841
2842 a) the offset cannot be loaded by a 16-bit move or
2843 b) there is no spare register into which we can move it. */
2844 if (moffset < 0x1000000
2845 && ((!temp1 && !can_create_pseudo_p ())
2846 || !aarch64_move_imm (moffset, mode)))
2847 {
2848 HOST_WIDE_INT low_off = moffset & 0xfff;
2849
2850 low_off = offset < 0 ? -low_off : low_off;
2851 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2853 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2854 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2855 return;
2856 }
2857
2858 /* Emit a move immediate if required and an addition/subtraction. */
2859 if (emit_move_imm)
2860 {
2861 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2862 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2863 }
2864 insn = emit_insn (offset < 0
2865 ? gen_sub3_insn (dest, src, temp1)
2866 : gen_add3_insn (dest, src, temp1));
2867 if (frame_related_p)
2868 {
2869 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2870 rtx adj = plus_constant (mode, src, offset);
2871 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2872 }
2873 }
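
/* For instance, adding the offset 0x12345 to SP after register
   allocation with no spare temporary: the value does not fit a single
   ADD immediate but is below 1 << 24, so the two-addition path above
   produces roughly

     add sp, sp, 0x345
     add sp, sp, 0x12000

   adjusting SP monotonically towards its final value.  (Illustrative
   only.)  */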
2874
2875 /* Return the number of temporary registers that aarch64_add_offset
2876 would need to move OFFSET into a register or add OFFSET to a register;
2877 ADD_P is true if we want the latter rather than the former. */
2878
2879 static unsigned int
2880 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2881 {
2882 /* This follows the same structure as aarch64_add_offset. */
2883 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2884 return 0;
2885
2886 unsigned int count = 0;
2887 HOST_WIDE_INT factor = offset.coeffs[1];
2888 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2889 poly_int64 poly_offset (factor, factor);
2890 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2891 /* Need one register for the ADDVL/ADDPL result. */
2892 count += 1;
2893 else if (factor != 0)
2894 {
2895 factor = abs (factor);
2896 if (factor > 16 * (factor & -factor))
2897 /* Need one register for the CNT result and one for the multiplication
2898 factor. If necessary, the second temporary can be reused for the
2899 constant part of the offset. */
2900 return 2;
2901 /* Need one register for the CNT result (which might then
2902 be shifted). */
2903 count += 1;
2904 }
2905 return count + aarch64_add_offset_1_temporaries (constant);
2906 }
2907
2908 /* If X can be represented as a poly_int64, return the number
2909 of temporaries that are required to add it to a register.
2910 Return -1 otherwise. */
2911
2912 int
2913 aarch64_add_offset_temporaries (rtx x)
2914 {
2915 poly_int64 offset;
2916 if (!poly_int_rtx_p (x, &offset))
2917 return -1;
2918 return aarch64_offset_temporaries (true, offset);
2919 }
2920
2921 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2922 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2923 be set and CFA adjustments added to the generated instructions.
2924
2925 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2926 temporary if register allocation is already complete. This temporary
2927 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2928 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2929 false to avoid emitting the immediate again.
2930
2931 TEMP2, if nonnull, is a second temporary register that doesn't
2932 overlap either DEST or SRC.
2933
2934 Since this function may be used to adjust the stack pointer, we must
2935 ensure that it cannot cause transient stack deallocation (for example
2936 by first incrementing SP and then decrementing when adjusting by a
2937 large immediate). */
2938
2939 static void
2940 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2941 poly_int64 offset, rtx temp1, rtx temp2,
2942 bool frame_related_p, bool emit_move_imm = true)
2943 {
2944 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2945 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2946 gcc_assert (temp1 == NULL_RTX
2947 || !frame_related_p
2948 || !reg_overlap_mentioned_p (temp1, dest));
2949 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2950
2951 /* Try using ADDVL or ADDPL to add the whole value. */
2952 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2953 {
2954 rtx offset_rtx = gen_int_mode (offset, mode);
2955 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2956 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2957 return;
2958 }
2959
2960 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2961 SVE vector register, over and above the minimum size of 128 bits.
2962 This is equivalent to half the value returned by CNTD with a
2963 vector shape of ALL. */
2964 HOST_WIDE_INT factor = offset.coeffs[1];
2965 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2966
2967 /* Try using ADDVL or ADDPL to add the VG-based part. */
2968 poly_int64 poly_offset (factor, factor);
2969 if (src != const0_rtx
2970 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2971 {
2972 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2973 if (frame_related_p)
2974 {
2975 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2976 RTX_FRAME_RELATED_P (insn) = true;
2977 src = dest;
2978 }
2979 else
2980 {
2981 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2982 src = aarch64_force_temporary (mode, temp1, addr);
2983 temp1 = temp2;
2984 temp2 = NULL_RTX;
2985 }
2986 }
2987 /* Otherwise use a CNT-based sequence. */
2988 else if (factor != 0)
2989 {
2990 /* Use a subtraction if we have a negative factor. */
2991 rtx_code code = PLUS;
2992 if (factor < 0)
2993 {
2994 factor = -factor;
2995 code = MINUS;
2996 }
2997
2998 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2999 into the multiplication. */
3000 rtx val;
3001 int shift = 0;
3002 if (factor & 1)
3003 /* Use a right shift by 1. */
3004 shift = -1;
3005 else
3006 factor /= 2;
3007 HOST_WIDE_INT low_bit = factor & -factor;
3008 if (factor <= 16 * low_bit)
3009 {
3010 if (factor > 16 * 8)
3011 {
3012 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3013 the value with the minimum multiplier and shift it into
3014 position. */
3015 int extra_shift = exact_log2 (low_bit);
3016 shift += extra_shift;
3017 factor >>= extra_shift;
3018 }
3019 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3020 }
3021 else
3022 {
3023 /* Use CNTD, then multiply it by FACTOR. */
3024 val = gen_int_mode (poly_int64 (2, 2), mode);
3025 val = aarch64_force_temporary (mode, temp1, val);
3026
3027 /* Go back to using a negative multiplication factor if we have
3028 no register from which to subtract. */
3029 if (code == MINUS && src == const0_rtx)
3030 {
3031 factor = -factor;
3032 code = PLUS;
3033 }
3034 rtx coeff1 = gen_int_mode (factor, mode);
3035 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3036 val = gen_rtx_MULT (mode, val, coeff1);
3037 }
3038
3039 if (shift > 0)
3040 {
3041 /* Multiply by 1 << SHIFT. */
3042 val = aarch64_force_temporary (mode, temp1, val);
3043 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3044 }
3045 else if (shift == -1)
3046 {
3047 /* Divide by 2. */
3048 val = aarch64_force_temporary (mode, temp1, val);
3049 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3050 }
3051
3052 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3053 if (src != const0_rtx)
3054 {
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_fmt_ee (code, mode, src, val);
3057 }
3058 else if (code == MINUS)
3059 {
3060 val = aarch64_force_temporary (mode, temp1, val);
3061 val = gen_rtx_NEG (mode, val);
3062 }
3063
3064 if (constant == 0 || frame_related_p)
3065 {
3066 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3067 if (frame_related_p)
3068 {
3069 RTX_FRAME_RELATED_P (insn) = true;
3070 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3071 gen_rtx_SET (dest, plus_constant (Pmode, src,
3072 poly_offset)));
3073 }
3074 src = dest;
3075 if (constant == 0)
3076 return;
3077 }
3078 else
3079 {
3080 src = aarch64_force_temporary (mode, temp1, val);
3081 temp1 = temp2;
3082 temp2 = NULL_RTX;
3083 }
3084
3085 emit_move_imm = true;
3086 }
3087
3088 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3089 frame_related_p, emit_move_imm);
3090 }
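
/* A brief SVE example of the decomposition above: an offset of
   16 * VQ + 8 bytes splits into factor 16 and constant 8; the
   VG-based part is a single ADDVL and the remainder is a plain ADD,
   giving roughly

     addvl x0, x1, #1
     add   x0, x0, 8

   whereas a factor that is not an ADDVL/ADDPL immediate falls back to
   the CNT-based multiply sequence.  (Illustrative only.)  */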
3091
3092 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3093 than a poly_int64. */
3094
3095 void
3096 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3097 rtx offset_rtx, rtx temp1, rtx temp2)
3098 {
3099 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3100 temp1, temp2, false);
3101 }
3102
3103 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3104 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3105 if TEMP1 already contains abs (DELTA). */
3106
3107 static inline void
3108 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3109 {
3110 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3111 temp1, temp2, true, emit_move_imm);
3112 }
3113
3114 /* Subtract DELTA from the stack pointer, marking the instructions
3115 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3116 if nonnull. */
3117
3118 static inline void
3119 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3120 bool emit_move_imm = true)
3121 {
3122 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3123 temp1, temp2, frame_related_p, emit_move_imm);
3124 }
3125
3126 /* Set DEST to (vec_series BASE STEP). */
3127
3128 static void
3129 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3130 {
3131 machine_mode mode = GET_MODE (dest);
3132 scalar_mode inner = GET_MODE_INNER (mode);
3133
3134 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3135 if (!aarch64_sve_index_immediate_p (base))
3136 base = force_reg (inner, base);
3137 if (!aarch64_sve_index_immediate_p (step))
3138 step = force_reg (inner, step);
3139
3140 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3141 }
3142
3143 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3144 integer of mode SRC_MODE. Return true on success. */
3145
3146 static bool
3147 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3148 rtx src)
3149 {
3150 /* If the constant is smaller than 128 bits, we can do the move
3151 using a vector of SRC_MODEs. */
3152 if (src_mode != TImode)
3153 {
3154 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3155 GET_MODE_SIZE (src_mode));
3156 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3157 emit_move_insn (gen_lowpart (dup_mode, dest),
3158 gen_const_vec_duplicate (dup_mode, src));
3159 return true;
3160 }
3161
3162 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3163 src = force_const_mem (src_mode, src);
3164 if (!src)
3165 return false;
3166
3167 /* Make sure that the address is legitimate. */
3168 if (!aarch64_sve_ld1r_operand_p (src))
3169 {
3170 rtx addr = force_reg (Pmode, XEXP (src, 0));
3171 src = replace_equiv_address (src, addr);
3172 }
3173
3174 machine_mode mode = GET_MODE (dest);
3175 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3176 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3177 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3178 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3179 emit_insn (gen_rtx_SET (dest, src));
3180 return true;
3181 }
3182
3183 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3184 isn't a simple duplicate or series. */
3185
3186 static void
3187 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3188 {
3189 machine_mode mode = GET_MODE (src);
3190 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3191 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3192 gcc_assert (npatterns > 1);
3193
3194 if (nelts_per_pattern == 1)
3195 {
3196 /* The constant is a repeating sequence of at least two elements,
3197 where the repeating elements occupy no more than 128 bits.
3198 Get an integer representation of the replicated value. */
3199 scalar_int_mode int_mode;
3200 if (BYTES_BIG_ENDIAN)
3201 /* For now, always use LD1RQ to load the value on big-endian
3202 targets, since the handling of smaller integers includes a
3203 subreg that is semantically an element reverse. */
3204 int_mode = TImode;
3205 else
3206 {
3207 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3208 gcc_assert (int_bits <= 128);
3209 int_mode = int_mode_for_size (int_bits, 0).require ();
3210 }
3211 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3212 if (int_value
3213 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3214 return;
3215 }
3216
3217 /* Expand each pattern individually. */
3218 rtx_vector_builder builder;
3219 auto_vec<rtx, 16> vectors (npatterns);
3220 for (unsigned int i = 0; i < npatterns; ++i)
3221 {
3222 builder.new_vector (mode, 1, nelts_per_pattern);
3223 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3224 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3225 vectors.quick_push (force_reg (mode, builder.build ()));
3226 }
3227
3228 /* Use permutes to interleave the separate vectors. */
3229 while (npatterns > 1)
3230 {
3231 npatterns /= 2;
3232 for (unsigned int i = 0; i < npatterns; ++i)
3233 {
3234 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3235 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3236 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3237 vectors[i] = tmp;
3238 }
3239 }
3240 gcc_assert (vectors[0] == dest);
3241 }
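
/* To illustrate the interleaving loop above: a VNx4SI constant with
   npatterns == 2 such as { 0, 100, 1, 101, 2, 102, ... } is built by
   expanding the two series { 0, 1, 2, ... } and { 100, 101, 102, ... }
   into separate registers (e.g. via INDEX) and then combining them
   with a single ZIP1, which interleaves their lanes back into the
   required order.  (Illustrative only.)  */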
3242
3243 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3244 is a pattern that can be used to set DEST to a replicated scalar
3245 element. */
3246
3247 void
3248 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3249 rtx (*gen_vec_duplicate) (rtx, rtx))
3250 {
3251 machine_mode mode = GET_MODE (dest);
3252
3253 /* Check on what type of symbol it is. */
3254 scalar_int_mode int_mode;
3255 if ((GET_CODE (imm) == SYMBOL_REF
3256 || GET_CODE (imm) == LABEL_REF
3257 || GET_CODE (imm) == CONST
3258 || GET_CODE (imm) == CONST_POLY_INT)
3259 && is_a <scalar_int_mode> (mode, &int_mode))
3260 {
3261 rtx mem;
3262 poly_int64 offset;
3263 HOST_WIDE_INT const_offset;
3264 enum aarch64_symbol_type sty;
3265
3266 /* If we have (const (plus symbol offset)), separate out the offset
3267 before we start classifying the symbol. */
3268 rtx base = strip_offset (imm, &offset);
3269
3270 /* We must always add an offset involving VL separately, rather than
3271 folding it into the relocation. */
3272 if (!offset.is_constant (&const_offset))
3273 {
3274 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3275 emit_insn (gen_rtx_SET (dest, imm));
3276 else
3277 {
3278 /* Do arithmetic on 32-bit values if the result is smaller
3279 than that. */
3280 if (partial_subreg_p (int_mode, SImode))
3281 {
3282 /* It is invalid to do symbol calculations in modes
3283 narrower than SImode. */
3284 gcc_assert (base == const0_rtx);
3285 dest = gen_lowpart (SImode, dest);
3286 int_mode = SImode;
3287 }
3288 if (base != const0_rtx)
3289 {
3290 base = aarch64_force_temporary (int_mode, dest, base);
3291 aarch64_add_offset (int_mode, dest, base, offset,
3292 NULL_RTX, NULL_RTX, false);
3293 }
3294 else
3295 aarch64_add_offset (int_mode, dest, base, offset,
3296 dest, NULL_RTX, false);
3297 }
3298 return;
3299 }
3300
3301 sty = aarch64_classify_symbol (base, const_offset);
3302 switch (sty)
3303 {
3304 case SYMBOL_FORCE_TO_MEM:
3305 if (const_offset != 0
3306 && targetm.cannot_force_const_mem (int_mode, imm))
3307 {
3308 gcc_assert (can_create_pseudo_p ());
3309 base = aarch64_force_temporary (int_mode, dest, base);
3310 aarch64_add_offset (int_mode, dest, base, const_offset,
3311 NULL_RTX, NULL_RTX, false);
3312 return;
3313 }
3314
3315 mem = force_const_mem (ptr_mode, imm);
3316 gcc_assert (mem);
3317
3318 /* If we aren't generating PC relative literals, then
3319 we need to expand the literal pool access carefully.
3320 This is something that needs to be done in a number
3321 of places, so could well live as a separate function. */
3322 if (!aarch64_pcrelative_literal_loads)
3323 {
3324 gcc_assert (can_create_pseudo_p ());
3325 base = gen_reg_rtx (ptr_mode);
3326 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3327 if (ptr_mode != Pmode)
3328 base = convert_memory_address (Pmode, base);
3329 mem = gen_rtx_MEM (ptr_mode, base);
3330 }
3331
3332 if (int_mode != ptr_mode)
3333 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3334
3335 emit_insn (gen_rtx_SET (dest, mem));
3336
3337 return;
3338
3339 case SYMBOL_SMALL_TLSGD:
3340 case SYMBOL_SMALL_TLSDESC:
3341 case SYMBOL_SMALL_TLSIE:
3342 case SYMBOL_SMALL_GOT_28K:
3343 case SYMBOL_SMALL_GOT_4G:
3344 case SYMBOL_TINY_GOT:
3345 case SYMBOL_TINY_TLSIE:
3346 if (const_offset != 0)
3347 {
3348 gcc_assert (can_create_pseudo_p ());
3349 base = aarch64_force_temporary (int_mode, dest, base);
3350 aarch64_add_offset (int_mode, dest, base, const_offset,
3351 NULL_RTX, NULL_RTX, false);
3352 return;
3353 }
3354 /* FALLTHRU */
3355
3356 case SYMBOL_SMALL_ABSOLUTE:
3357 case SYMBOL_TINY_ABSOLUTE:
3358 case SYMBOL_TLSLE12:
3359 case SYMBOL_TLSLE24:
3360 case SYMBOL_TLSLE32:
3361 case SYMBOL_TLSLE48:
3362 aarch64_load_symref_appropriately (dest, imm, sty);
3363 return;
3364
3365 default:
3366 gcc_unreachable ();
3367 }
3368 }
3369
3370 if (!CONST_INT_P (imm))
3371 {
3372 rtx base, step, value;
3373 if (GET_CODE (imm) == HIGH
3374 || aarch64_simd_valid_immediate (imm, NULL))
3375 emit_insn (gen_rtx_SET (dest, imm));
3376 else if (const_vec_series_p (imm, &base, &step))
3377 aarch64_expand_vec_series (dest, base, step);
3378 else if (const_vec_duplicate_p (imm, &value))
3379 {
3380 /* If the constant is out of range of an SVE vector move,
3381 load it from memory if we can, otherwise move it into
3382 a register and use a DUP. */
3383 scalar_mode inner_mode = GET_MODE_INNER (mode);
3384 rtx op = force_const_mem (inner_mode, value);
3385 if (!op)
3386 op = force_reg (inner_mode, value);
3387 else if (!aarch64_sve_ld1r_operand_p (op))
3388 {
3389 rtx addr = force_reg (Pmode, XEXP (op, 0));
3390 op = replace_equiv_address (op, addr);
3391 }
3392 emit_insn (gen_vec_duplicate (dest, op));
3393 }
3394 else if (GET_CODE (imm) == CONST_VECTOR
3395 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3396 aarch64_expand_sve_const_vector (dest, imm);
3397 else
3398 {
3399 rtx mem = force_const_mem (mode, imm);
3400 gcc_assert (mem);
3401 emit_move_insn (dest, mem);
3402 }
3403
3404 return;
3405 }
3406
3407 aarch64_internal_mov_immediate (dest, imm, true,
3408 as_a <scalar_int_mode> (mode));
3409 }
3410
3411 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3412 that is known to contain PTRUE. */
3413
3414 void
3415 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3416 {
3417 expand_operand ops[3];
3418 machine_mode mode = GET_MODE (dest);
3419 create_output_operand (&ops[0], dest, mode);
3420 create_input_operand (&ops[1], pred, GET_MODE (pred));
3421 create_input_operand (&ops[2], src, mode);
3422 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3423 }
3424
3425 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3426 operand is in memory. In this case we need to use the predicated LD1
3427 and ST1 instead of LDR and STR, both for correctness on big-endian
3428 targets and because LD1 and ST1 support a wider range of addressing modes.
3429 PRED_MODE is the mode of the predicate.
3430
3431 See the comment at the head of aarch64-sve.md for details about the
3432 big-endian handling. */
3433
3434 void
3435 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3436 {
3437 machine_mode mode = GET_MODE (dest);
3438 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3439 if (!register_operand (src, mode)
3440 && !register_operand (dest, mode))
3441 {
3442 rtx tmp = gen_reg_rtx (mode);
3443 if (MEM_P (src))
3444 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3445 else
3446 emit_move_insn (tmp, src);
3447 src = tmp;
3448 }
3449 aarch64_emit_sve_pred_move (dest, ptrue, src);
3450 }
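
/* For example, a memory-to-memory copy of a VNx2DI value cannot be a
   single predicated move, so it goes through a temporary and becomes
   roughly

     ptrue p0.d, all
     ld1d  z0.d, p0/z, [x1]
     st1d  z0.d, p0, [x0]

   with the actual registers chosen later by the register allocator.
   (Illustrative only.)  */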
3451
3452 /* Called only on big-endian targets. See whether an SVE vector move
3453 from SRC to DEST is effectively a REV[BHW] instruction, because at
3454 least one operand is a subreg of an SVE vector that has wider or
3455 narrower elements. Return true and emit the instruction if so.
3456
3457 For example:
3458
3459 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3460
3461 represents a VIEW_CONVERT between the following vectors, viewed
3462 in memory order:
3463
3464 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3465 R1: { [0], [1], [2], [3], ... }
3466
3467 The high part of lane X in R2 should therefore correspond to lane X*2
3468 of R1, but the register representations are:
3469
3470 msb lsb
3471 R2: ...... [1].high [1].low [0].high [0].low
3472 R1: ...... [3] [2] [1] [0]
3473
3474 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3475 We therefore need a reverse operation to swap the high and low values
3476 around.
3477
3478 This is purely an optimization. Without it we would spill the
3479 subreg operand to the stack in one mode and reload it in the
3480 other mode, which has the same effect as the REV. */
3481
3482 bool
3483 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3484 {
3485 gcc_assert (BYTES_BIG_ENDIAN);
3486 if (GET_CODE (dest) == SUBREG)
3487 dest = SUBREG_REG (dest);
3488 if (GET_CODE (src) == SUBREG)
3489 src = SUBREG_REG (src);
3490
3491 /* The optimization handles two single SVE REGs with different element
3492 sizes. */
3493 if (!REG_P (dest)
3494 || !REG_P (src)
3495 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3496 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3497 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3498 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3499 return false;
3500
3501 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3502 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3503 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3504 UNSPEC_REV_SUBREG);
3505 emit_insn (gen_rtx_SET (dest, unspec));
3506 return true;
3507 }
3508
3509 /* Return a copy of X with mode MODE, without changing its other
3510 attributes. Unlike gen_lowpart, this doesn't care whether the
3511 mode change is valid. */
3512
3513 static rtx
3514 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3515 {
3516 if (GET_MODE (x) == mode)
3517 return x;
3518
3519 x = shallow_copy_rtx (x);
3520 set_mode_and_regno (x, mode, REGNO (x));
3521 return x;
3522 }
3523
3524 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3525 operands. */
3526
3527 void
3528 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3529 {
3530 /* Decide which REV operation we need. The mode with narrower elements
3531 determines the mode of the operands and the mode with the wider
3532 elements determines the reverse width. */
3533 machine_mode mode_with_wider_elts = GET_MODE (dest);
3534 machine_mode mode_with_narrower_elts = GET_MODE (src);
3535 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3536 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3537 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3538
3539 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3540 unsigned int unspec;
3541 if (wider_bytes == 8)
3542 unspec = UNSPEC_REV64;
3543 else if (wider_bytes == 4)
3544 unspec = UNSPEC_REV32;
3545 else if (wider_bytes == 2)
3546 unspec = UNSPEC_REV16;
3547 else
3548 gcc_unreachable ();
3549 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3550
3551 /* Emit:
3552
3553 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3554 UNSPEC_MERGE_PTRUE))
3555
3556 with the appropriate modes. */
3557 ptrue = gen_lowpart (pred_mode, ptrue);
3558 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3559 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3560 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3561 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3562 UNSPEC_MERGE_PTRUE);
3563 emit_insn (gen_rtx_SET (dest, src));
3564 }
3565
3566 static bool
3567 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3568 tree exp ATTRIBUTE_UNUSED)
3569 {
3570 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3571 return false;
3572
3573 return true;
3574 }
3575
3576 /* Implement TARGET_PASS_BY_REFERENCE. */
3577
3578 static bool
3579 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3580 machine_mode mode,
3581 const_tree type,
3582 bool named ATTRIBUTE_UNUSED)
3583 {
3584 HOST_WIDE_INT size;
3585 machine_mode dummymode;
3586 int nregs;
3587
3588 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3589 if (mode == BLKmode && type)
3590 size = int_size_in_bytes (type);
3591 else
3592 /* No frontends can create types with variable-sized modes, so we
3593 shouldn't be asked to pass or return them. */
3594 size = GET_MODE_SIZE (mode).to_constant ();
3595
3596 /* Aggregates are passed by reference based on their size. */
3597 if (type && AGGREGATE_TYPE_P (type))
3598 {
3599 size = int_size_in_bytes (type);
3600 }
3601
3602 /* Variable-sized arguments are always passed by reference. */
3603 if (size < 0)
3604 return true;
3605
3606 /* Can this be a candidate to be passed in fp/simd register(s)? */
3607 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3608 &dummymode, &nregs,
3609 NULL))
3610 return false;
3611
3612 /* Arguments which are variable-sized or larger than 2 registers are
3613 passed by reference unless they form a homogeneous floating-point
3614 aggregate. */
3615 return size > 2 * UNITS_PER_WORD;
3616 }
3617
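/* A few worked examples of the rules above (a sketch, assuming the usual
   LP64 configuration where UNITS_PER_WORD is 8, making the by-value limit
   16 bytes):

     __int128                    size 16, not an aggregate  -> by value
     struct { double d[4]; }     HFA of four doubles        -> by value
                                 (an fp/simd candidate, so the size test
                                 is never reached)
     struct { char c[24]; }      24-byte aggregate, not HFA -> by reference
     variably sized aggregates   size < 0                   -> by reference  */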
3618 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3619 static bool
3620 aarch64_return_in_msb (const_tree valtype)
3621 {
3622 machine_mode dummy_mode;
3623 int dummy_int;
3624
3625 /* Never happens in little-endian mode. */
3626 if (!BYTES_BIG_ENDIAN)
3627 return false;
3628
3629 /* Only composite types smaller than or equal to 16 bytes can
3630 be potentially returned in registers. */
3631 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3632 || int_size_in_bytes (valtype) <= 0
3633 || int_size_in_bytes (valtype) > 16)
3634 return false;
3635
3636 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3637 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3638 is always passed/returned in the least significant bits of fp/simd
3639 register(s). */
3640 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3641 &dummy_mode, &dummy_int, NULL))
3642 return false;
3643
3644 return true;
3645 }
3646
3647 /* Implement TARGET_FUNCTION_VALUE.
3648 Define how to find the value returned by a function. */
3649
3650 static rtx
3651 aarch64_function_value (const_tree type, const_tree func,
3652 bool outgoing ATTRIBUTE_UNUSED)
3653 {
3654 machine_mode mode;
3655 int unsignedp;
3656 int count;
3657 machine_mode ag_mode;
3658
3659 mode = TYPE_MODE (type);
3660 if (INTEGRAL_TYPE_P (type))
3661 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3662
3663 if (aarch64_return_in_msb (type))
3664 {
3665 HOST_WIDE_INT size = int_size_in_bytes (type);
3666
3667 if (size % UNITS_PER_WORD != 0)
3668 {
3669 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3670 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3671 }
3672 }
3673
3674 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3675 &ag_mode, &count, NULL))
3676 {
3677 if (!aarch64_composite_type_p (type, mode))
3678 {
3679 gcc_assert (count == 1 && mode == ag_mode);
3680 return gen_rtx_REG (mode, V0_REGNUM);
3681 }
3682 else
3683 {
3684 int i;
3685 rtx par;
3686
3687 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3688 for (i = 0; i < count; i++)
3689 {
3690 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3691 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3692 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3693 XVECEXP (par, 0, i) = tmp;
3694 }
3695 return par;
3696 }
3697 }
3698 else
3699 return gen_rtx_REG (mode, R0_REGNUM);
3700 }
3701
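/* As a concrete sketch of the PARALLEL built above: returning an HFA such
   as struct { float x, y, z; } gives count == 3 with ag_mode == SFmode, so
   the result is described roughly as

     (parallel [(expr_list (reg:SF v0) (const_int 0))
                (expr_list (reg:SF v1) (const_int 4))
                (expr_list (reg:SF v2) (const_int 8))])

   i.e. one fp/simd register per member, each annotated with that member's
   byte offset in the aggregate.  A scalar integer result simply ends up
   in x0 via the final gen_rtx_REG call.  */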
3702 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3703 Return true if REGNO is the number of a hard register in which the value
3704 of a called function may come back. */
3705
3706 static bool
3707 aarch64_function_value_regno_p (const unsigned int regno)
3708 {
3709 /* A maximum of 16 bytes can be returned in the general registers. Examples
3710 of 16-byte return values are: 128-bit integers and 16-byte small
3711 structures (excluding homogeneous floating-point aggregates). */
3712 if (regno == R0_REGNUM || regno == R1_REGNUM)
3713 return true;
3714
3715 /* Up to four fp/simd registers can return a function value, e.g. a
3716 homogeneous floating-point aggregate having four members. */
3717 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3718 return TARGET_FLOAT;
3719
3720 return false;
3721 }
3722
3723 /* Implement TARGET_RETURN_IN_MEMORY.
3724
3725 If the type T of the result of a function is such that
3726 void func (T arg)
3727 would require that arg be passed as a value in a register (or set of
3728 registers) according to the parameter passing rules, then the result
3729 is returned in the same registers as would be used for such an
3730 argument. */
3731
3732 static bool
3733 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3734 {
3735 HOST_WIDE_INT size;
3736 machine_mode ag_mode;
3737 int count;
3738
3739 if (!AGGREGATE_TYPE_P (type)
3740 && TREE_CODE (type) != COMPLEX_TYPE
3741 && TREE_CODE (type) != VECTOR_TYPE)
3742 /* Simple scalar types are always returned in registers. */
3743 return false;
3744
3745 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3746 type,
3747 &ag_mode,
3748 &count,
3749 NULL))
3750 return false;
3751
3752 /* Types larger than 2 registers are returned in memory. */
3753 size = int_size_in_bytes (type);
3754 return (size < 0 || size > 2 * UNITS_PER_WORD);
3755 }
3756
3757 static bool
3758 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3759 const_tree type, int *nregs)
3760 {
3761 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3762 return aarch64_vfp_is_call_or_return_candidate (mode,
3763 type,
3764 &pcum->aapcs_vfp_rmode,
3765 nregs,
3766 NULL);
3767 }
3768
3769 /* Given MODE and TYPE of a function argument, return the alignment in
3770 bits. The idea is to suppress any stronger alignment requested by
3771 the user and opt for the natural alignment (specified in AAPCS64 \S
3772 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3773 calculated in versions of GCC prior to GCC-9. This is a helper
3774 function for local use only. */
3775
3776 static unsigned int
3777 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3778 bool *abi_break)
3779 {
3780 *abi_break = false;
3781 if (!type)
3782 return GET_MODE_ALIGNMENT (mode);
3783
3784 if (integer_zerop (TYPE_SIZE (type)))
3785 return 0;
3786
3787 gcc_assert (TYPE_MODE (type) == mode);
3788
3789 if (!AGGREGATE_TYPE_P (type))
3790 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3791
3792 if (TREE_CODE (type) == ARRAY_TYPE)
3793 return TYPE_ALIGN (TREE_TYPE (type));
3794
3795 unsigned int alignment = 0;
3796 unsigned int bitfield_alignment = 0;
3797 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3798 if (TREE_CODE (field) == FIELD_DECL)
3799 {
3800 alignment = std::max (alignment, DECL_ALIGN (field));
3801 if (DECL_BIT_FIELD_TYPE (field))
3802 bitfield_alignment
3803 = std::max (bitfield_alignment,
3804 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3805 }
3806
3807 if (bitfield_alignment > alignment)
3808 {
3809 *abi_break = true;
3810 return bitfield_alignment;
3811 }
3812
3813 return alignment;
3814 }
3815
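/* Some examples of what the helper above computes, using typical AArch64
   type layouts (a sketch, not a statement of the AAPCS64 itself):

     double                        -> 64  (natural alignment of the type)
     an array type                 -> the alignment of its element type
     struct { int a; double b; }   -> 64  (largest field alignment)

   Over-alignment requested on the aggregate type itself is not picked up
   here, because the walk only inspects the alignment of the fields; that
   is how the "suppress stronger user alignment" intent above plays out.  */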
3816 /* Lay out a function argument according to the AAPCS64 rules. The rule
3817 numbers refer to the rule numbers in the AAPCS64 document. */
3818
3819 static void
3820 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3821 const_tree type,
3822 bool named ATTRIBUTE_UNUSED)
3823 {
3824 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3825 int ncrn, nvrn, nregs;
3826 bool allocate_ncrn, allocate_nvrn;
3827 HOST_WIDE_INT size;
3828 bool abi_break;
3829
3830 /* We need to do this once per argument. */
3831 if (pcum->aapcs_arg_processed)
3832 return;
3833
3834 pcum->aapcs_arg_processed = true;
3835
3836 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3837 if (type)
3838 size = int_size_in_bytes (type);
3839 else
3840 /* No frontends can create types with variable-sized modes, so we
3841 shouldn't be asked to pass or return them. */
3842 size = GET_MODE_SIZE (mode).to_constant ();
3843 size = ROUND_UP (size, UNITS_PER_WORD);
3844
3845 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3846 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3847 mode,
3848 type,
3849 &nregs);
3850
3851 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3852 The following code thus handles passing by SIMD/FP registers first. */
3853
3854 nvrn = pcum->aapcs_nvrn;
3855
3856 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3857 and homogeneous short-vector aggregates (HVA). */
3858 if (allocate_nvrn)
3859 {
3860 if (!TARGET_FLOAT)
3861 aarch64_err_no_fpadvsimd (mode);
3862
3863 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3864 {
3865 pcum->aapcs_nextnvrn = nvrn + nregs;
3866 if (!aarch64_composite_type_p (type, mode))
3867 {
3868 gcc_assert (nregs == 1);
3869 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3870 }
3871 else
3872 {
3873 rtx par;
3874 int i;
3875 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3876 for (i = 0; i < nregs; i++)
3877 {
3878 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3879 V0_REGNUM + nvrn + i);
3880 rtx offset = gen_int_mode
3881 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3882 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3883 XVECEXP (par, 0, i) = tmp;
3884 }
3885 pcum->aapcs_reg = par;
3886 }
3887 return;
3888 }
3889 else
3890 {
3891 /* C.3 NSRN is set to 8. */
3892 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3893 goto on_stack;
3894 }
3895 }
3896
3897 ncrn = pcum->aapcs_ncrn;
3898 nregs = size / UNITS_PER_WORD;
3899
3900 /* C6 - C9, though the sign and zero extension semantics are
3901 handled elsewhere. This is the case where the argument fits
3902 entirely in general registers. */
3903 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3904 {
3905 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3906
3907 /* C.8 if the argument has an alignment of 16 then the NGRN is
3908 rounded up to the next even number. */
3909 if (nregs == 2
3910 && ncrn % 2
3911 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3912 comparison is there because for > 16 * BITS_PER_UNIT
3913 alignment nregs should be > 2 and therefore it should be
3914 passed by reference rather than value. */
3915 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3916 == 16 * BITS_PER_UNIT))
3917 {
3918 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3919 inform (input_location, "parameter passing for argument of type "
3920 "%qT changed in GCC 9.1", type);
3921 ++ncrn;
3922 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3923 }
3924
3925 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3926 A reg is still generated for it, but the caller should be smart
3927 enough not to use it. */
3928 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3929 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3930 else
3931 {
3932 rtx par;
3933 int i;
3934
3935 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3936 for (i = 0; i < nregs; i++)
3937 {
3938 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3939 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3940 GEN_INT (i * UNITS_PER_WORD));
3941 XVECEXP (par, 0, i) = tmp;
3942 }
3943 pcum->aapcs_reg = par;
3944 }
3945
3946 pcum->aapcs_nextncrn = ncrn + nregs;
3947 return;
3948 }
3949
3950 /* C.11 */
3951 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3952
3953 /* The argument is passed on stack; record the needed number of words for
3954 this argument and align the total size if necessary. */
3955 on_stack:
3956 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3957
3958 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3959 == 16 * BITS_PER_UNIT)
3960 {
3961 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
3962 if (pcum->aapcs_stack_size != new_size)
3963 {
3964 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3965 inform (input_location, "parameter passing for argument of type "
3966 "%qT changed in GCC 9.1", type);
3967 pcum->aapcs_stack_size = new_size;
3968 }
3969 }
3970 return;
3971 }
3972
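/* A short worked example of rule C.8 above (a sketch; x0-x7 are the
   general-purpose argument registers):

     void f (int a, __int128 b);

   'a' takes x0, so NGRN is 1 when 'b' is laid out.  'b' needs nregs == 2
   and has 16-byte alignment, so NGRN is rounded up to 2 and 'b' is passed
   in the even/odd pair x2/x3, leaving x1 unused.  */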
3973 /* Implement TARGET_FUNCTION_ARG. */
3974
3975 static rtx
3976 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3977 const_tree type, bool named)
3978 {
3979 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3980 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3981
3982 if (mode == VOIDmode)
3983 return NULL_RTX;
3984
3985 aarch64_layout_arg (pcum_v, mode, type, named);
3986 return pcum->aapcs_reg;
3987 }
3988
3989 void
3990 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3991 const_tree fntype ATTRIBUTE_UNUSED,
3992 rtx libname ATTRIBUTE_UNUSED,
3993 const_tree fndecl ATTRIBUTE_UNUSED,
3994 unsigned n_named ATTRIBUTE_UNUSED)
3995 {
3996 pcum->aapcs_ncrn = 0;
3997 pcum->aapcs_nvrn = 0;
3998 pcum->aapcs_nextncrn = 0;
3999 pcum->aapcs_nextnvrn = 0;
4000 pcum->pcs_variant = ARM_PCS_AAPCS64;
4001 pcum->aapcs_reg = NULL_RTX;
4002 pcum->aapcs_arg_processed = false;
4003 pcum->aapcs_stack_words = 0;
4004 pcum->aapcs_stack_size = 0;
4005
4006 if (!TARGET_FLOAT
4007 && fndecl && TREE_PUBLIC (fndecl)
4008 && fntype && fntype != error_mark_node)
4009 {
4010 const_tree type = TREE_TYPE (fntype);
4011 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4012 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4013 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4014 &mode, &nregs, NULL))
4015 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4016 }
4017 return;
4018 }
4019
4020 static void
4021 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4022 machine_mode mode,
4023 const_tree type,
4024 bool named)
4025 {
4026 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4027 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4028 {
4029 aarch64_layout_arg (pcum_v, mode, type, named);
4030 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4031 != (pcum->aapcs_stack_words != 0));
4032 pcum->aapcs_arg_processed = false;
4033 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4034 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4035 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4036 pcum->aapcs_stack_words = 0;
4037 pcum->aapcs_reg = NULL_RTX;
4038 }
4039 }
4040
4041 bool
4042 aarch64_function_arg_regno_p (unsigned regno)
4043 {
4044 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4045 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4046 }
4047
4048 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4049 PARM_BOUNDARY bits of alignment, but will be given anything up
4050 to STACK_BOUNDARY bits if the type requires it. This makes sure
4051 that both before and after the layout of each argument, the Next
4052 Stacked Argument Address (NSAA) will have a minimum alignment of
4053 8 bytes. */
4054
4055 static unsigned int
4056 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4057 {
4058 bool abi_break;
4059 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4060 &abi_break);
4061 if (abi_break && warn_psabi)
4062 inform (input_location, "parameter passing for argument of type "
4063 "%qT changed in GCC 9.1", type);
4064
4065 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4066 }
4067
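/* For example (a sketch, assuming the usual PARM_BOUNDARY of 64 bits and
   STACK_BOUNDARY of 128 bits for this target): a char argument has an
   alignment of 8 bits, which the MAX clamps up to 64, so it still occupies
   a full 8-byte stack slot; an __int128 argument has an alignment of 128
   bits, which passes through unchanged, so the NSAA is rounded up to a
   16-byte boundary before the argument is laid out.  */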
4068 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4069
4070 static fixed_size_mode
4071 aarch64_get_reg_raw_mode (int regno)
4072 {
4073 if (TARGET_SVE && FP_REGNUM_P (regno))
4074 /* Don't use the SVE part of the register for __builtin_apply and
4075 __builtin_return. The SVE registers aren't used by the normal PCS,
4076 so using them there would be a waste of time. The PCS extensions
4077 for SVE types are fundamentally incompatible with the
4078 __builtin_return/__builtin_apply interface. */
4079 return as_a <fixed_size_mode> (V16QImode);
4080 return default_get_reg_raw_mode (regno);
4081 }
4082
4083 /* Implement TARGET_FUNCTION_ARG_PADDING.
4084
4085 Small aggregate types are placed in the lowest memory address.
4086
4087 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4088
4089 static pad_direction
4090 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4091 {
4092 /* On little-endian targets, the least significant byte of every stack
4093 argument is passed at the lowest byte address of the stack slot. */
4094 if (!BYTES_BIG_ENDIAN)
4095 return PAD_UPWARD;
4096
4097 /* Otherwise, integral, floating-point and pointer types are padded downward:
4098 the least significant byte of a stack argument is passed at the highest
4099 byte address of the stack slot. */
4100 if (type
4101 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4102 || POINTER_TYPE_P (type))
4103 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4104 return PAD_DOWNWARD;
4105
4106 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4107 return PAD_UPWARD;
4108 }
4109
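/* Reading the rules above with concrete types: on a big-endian target a
   short argument that ends up on the stack is padded downward, i.e. its
   bytes occupy the high-address end of the 8-byte slot, whereas a small
   aggregate such as struct { char a, b, c; } is padded upward and starts
   at the lowest address of its slot.  On little-endian targets everything
   is padded upward.  */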
4110 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4111
4112 It specifies padding for the last (possibly also the only)
4113 element of a block move between registers and memory. Viewing
4114 the block as if it were in memory, padding upward means that
4115 the last element is padded after its most significant byte,
4116 while with downward padding the last element is padded on its
4117 least significant byte side.
4118
4119 Small aggregates and small complex types are always padded
4120 upwards.
4121
4122 We don't need to worry about homogeneous floating-point or
4123 short-vector aggregates; their move is not affected by the
4124 padding direction determined here. Regardless of endianness,
4125 each element of such an aggregate is put in the least
4126 significant bits of a fp/simd register.
4127
4128 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4129 register has useful data, and return the opposite if the most
4130 significant byte does. */
4131
4132 bool
4133 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4134 bool first ATTRIBUTE_UNUSED)
4135 {
4136
4137 /* Small composite types are always padded upward. */
4138 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4139 {
4140 HOST_WIDE_INT size;
4141 if (type)
4142 size = int_size_in_bytes (type);
4143 else
4144 /* No frontends can create types with variable-sized modes, so we
4145 shouldn't be asked to pass or return them. */
4146 size = GET_MODE_SIZE (mode).to_constant ();
4147 if (size < 2 * UNITS_PER_WORD)
4148 return true;
4149 }
4150
4151 /* Otherwise, use the default padding. */
4152 return !BYTES_BIG_ENDIAN;
4153 }
4154
4155 static scalar_int_mode
4156 aarch64_libgcc_cmp_return_mode (void)
4157 {
4158 return SImode;
4159 }
4160
4161 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4162
4163 /* We use the 12-bit shifted immediate arithmetic instructions, so values
4164 must be a multiple of (1 << 12), i.e. 4096. */
4165 #define ARITH_FACTOR 4096
4166
4167 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4168 #error Cannot use simple address calculation for stack probing
4169 #endif
4170
4171 /* The pair of scratch registers used for stack probing. */
4172 #define PROBE_STACK_FIRST_REG R9_REGNUM
4173 #define PROBE_STACK_SECOND_REG R10_REGNUM
4174
4175 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4176 inclusive. These are offsets from the current stack pointer. */
4177
4178 static void
4179 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4180 {
4181 HOST_WIDE_INT size;
4182 if (!poly_size.is_constant (&size))
4183 {
4184 sorry ("stack probes for SVE frames");
4185 return;
4186 }
4187
4188 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4189
4190 /* See the same assertion on PROBE_INTERVAL above. */
4191 gcc_assert ((first % ARITH_FACTOR) == 0);
4192
4193 /* See if we have a constant small number of probes to generate. If so,
4194 that's the easy case. */
4195 if (size <= PROBE_INTERVAL)
4196 {
4197 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4198
4199 emit_set_insn (reg1,
4200 plus_constant (Pmode,
4201 stack_pointer_rtx, -(first + base)));
4202 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4203 }
4204
4205 /* The run-time loop is made up of 8 insns in the generic case while the
4206 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4207 else if (size <= 4 * PROBE_INTERVAL)
4208 {
4209 HOST_WIDE_INT i, rem;
4210
4211 emit_set_insn (reg1,
4212 plus_constant (Pmode,
4213 stack_pointer_rtx,
4214 -(first + PROBE_INTERVAL)));
4215 emit_stack_probe (reg1);
4216
4217 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4218 it exceeds SIZE. If only two probes are needed, this will not
4219 generate any code. Then probe at FIRST + SIZE. */
4220 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4221 {
4222 emit_set_insn (reg1,
4223 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4224 emit_stack_probe (reg1);
4225 }
4226
4227 rem = size - (i - PROBE_INTERVAL);
4228 if (rem > 256)
4229 {
4230 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4231
4232 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4233 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4234 }
4235 else
4236 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4237 }
4238
4239 /* Otherwise, do the same as above, but in a loop. Note that we must be
4240 extra careful with variables wrapping around because we might be at
4241 the very top (or the very bottom) of the address space and we have
4242 to be able to handle this case properly; in particular, we use an
4243 equality test for the loop condition. */
4244 else
4245 {
4246 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4247
4248 /* Step 1: round SIZE to the previous multiple of the interval. */
4249
4250 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4251
4252
4253 /* Step 2: compute initial and final value of the loop counter. */
4254
4255 /* TEST_ADDR = SP + FIRST. */
4256 emit_set_insn (reg1,
4257 plus_constant (Pmode, stack_pointer_rtx, -first));
4258
4259 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4260 HOST_WIDE_INT adjustment = - (first + rounded_size);
4261 if (! aarch64_uimm12_shift (adjustment))
4262 {
4263 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4264 true, Pmode);
4265 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4266 }
4267 else
4268 emit_set_insn (reg2,
4269 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4270
4271 /* Step 3: the loop
4272
4273 do
4274 {
4275 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4276 probe at TEST_ADDR
4277 }
4278 while (TEST_ADDR != LAST_ADDR)
4279
4280 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4281 until it is equal to ROUNDED_SIZE. */
4282
4283 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4284
4285
4286 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4287 that SIZE is equal to ROUNDED_SIZE. */
4288
4289 if (size != rounded_size)
4290 {
4291 HOST_WIDE_INT rem = size - rounded_size;
4292
4293 if (rem > 256)
4294 {
4295 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4296
4297 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4298 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4299 }
4300 else
4301 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4302 }
4303 }
4304
4305 /* Make sure nothing is scheduled before we are done. */
4306 emit_insn (gen_blockage ());
4307 }
4308
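/* A worked example of the middle case above (a sketch, assuming
   PROBE_INTERVAL is 4096): for FIRST = F and SIZE = 10000 we probe at
   SP - (F + 4096), one loop iteration then probes SP - (F + 8192), and
   since the remainder 10000 - 8192 = 1808 exceeds 256 a final adjusted
   probe lands at SP - (F + 10000).  Every address between SP - F and
   SP - (F + 10000) is therefore within one PROBE_INTERVAL of a probe.  */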
4309 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4310 absolute addresses. */
4311
4312 const char *
4313 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4314 {
4315 static int labelno = 0;
4316 char loop_lab[32];
4317 rtx xops[2];
4318
4319 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4320
4321 /* Loop. */
4322 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4323
4324 HOST_WIDE_INT stack_clash_probe_interval
4325 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4326
4327 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4328 xops[0] = reg1;
4329 HOST_WIDE_INT interval;
4330 if (flag_stack_clash_protection)
4331 interval = stack_clash_probe_interval;
4332 else
4333 interval = PROBE_INTERVAL;
4334
4335 gcc_assert (aarch64_uimm12_shift (interval));
4336 xops[1] = GEN_INT (interval);
4337
4338 output_asm_insn ("sub\t%0, %0, %1", xops);
4339
4340 /* If doing stack clash protection then we probe up by the ABI specified
4341 amount. We do this because we're dropping full pages at a time in the
4342 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4343 if (flag_stack_clash_protection)
4344 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4345 else
4346 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4347
4348 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4349 by this amount for each iteration. */
4350 output_asm_insn ("str\txzr, [%0, %1]", xops);
4351
4352 /* Test if TEST_ADDR == LAST_ADDR. */
4353 xops[1] = reg2;
4354 output_asm_insn ("cmp\t%0, %1", xops);
4355
4356 /* Branch. */
4357 fputs ("\tb.ne\t", asm_out_file);
4358 assemble_name_raw (asm_out_file, loop_lab);
4359 fputc ('\n', asm_out_file);
4360
4361 return "";
4362 }
4363
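/* When called via gen_probe_stack_range from aarch64_emit_probe_stack_range
   above, REG1/REG2 are x9/x10 and the emitted loop looks roughly like this
   (a sketch; 4096 stands for the interval chosen above):

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9, 0]
		cmp	x9, x10
		b.ne	.LPSRL0

   With -fstack-clash-protection the interval is the guard size and the
   store instead probes STACK_CLASH_CALLER_GUARD bytes above the new
   TEST_ADDR.  */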
4364 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4365 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4366 of GUARD_SIZE. Each probe is emitted at most MIN_PROBE_THRESHOLD
4367 bytes from the current BASE, so successive probes are at most
4368 MIN_PROBE_THRESHOLD bytes apart. By the end of this function
4369 BASE = BASE - ADJUSTMENT. */
4370
4371 const char *
4372 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4373 rtx min_probe_threshold, rtx guard_size)
4374 {
4375 /* This function is not allowed to use any instruction generation function
4376 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4377 so instead emit the code you want using output_asm_insn. */
4378 gcc_assert (flag_stack_clash_protection);
4379 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4380 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4381
4382 /* The minimum required allocation before the residual requires probing. */
4383 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4384
4385 /* Clamp the value down to the nearest value that can be used with a cmp. */
4386 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4387 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4388
4389 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4390 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4391
4392 static int labelno = 0;
4393 char loop_start_lab[32];
4394 char loop_end_lab[32];
4395 rtx xops[2];
4396
4397 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4398 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4399
4400 /* Emit loop start label. */
4401 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4402
4403 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4404 xops[0] = adjustment;
4405 xops[1] = probe_offset_value_rtx;
4406 output_asm_insn ("cmp\t%0, %1", xops);
4407
4408 /* Branch to end if not enough adjustment to probe. */
4409 fputs ("\tb.lt\t", asm_out_file);
4410 assemble_name_raw (asm_out_file, loop_end_lab);
4411 fputc ('\n', asm_out_file);
4412
4413 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4414 xops[0] = base;
4415 xops[1] = probe_offset_value_rtx;
4416 output_asm_insn ("sub\t%0, %0, %1", xops);
4417
4418 /* Probe at BASE. */
4419 xops[1] = const0_rtx;
4420 output_asm_insn ("str\txzr, [%0, %1]", xops);
4421
4422 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4423 xops[0] = adjustment;
4424 xops[1] = probe_offset_value_rtx;
4425 output_asm_insn ("sub\t%0, %0, %1", xops);
4426
4427 /* Branch to start if still more bytes to allocate. */
4428 fputs ("\tb\t", asm_out_file);
4429 assemble_name_raw (asm_out_file, loop_start_lab);
4430 fputc ('\n', asm_out_file);
4431
4432 /* No probe needed; leave the loop. */
4433 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4434
4435 /* BASE = BASE - ADJUSTMENT. */
4436 xops[0] = base;
4437 xops[1] = adjustment;
4438 output_asm_insn ("sub\t%0, %0, %1", xops);
4439 return "";
4440 }
4441
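/* Putting the pieces above together, the emitted sequence is roughly
   (a sketch; "guard" stands for the clamped RESIDUAL_PROBE_GUARD value and
   "base"/"adj" for the BASE and ADJUSTMENT operands):

	.SVLPSPL0:
		cmp	adj, guard
		b.lt	.SVLPEND0
		sub	base, base, guard
		str	xzr, [base, 0]
		sub	adj, adj, guard
		b	.SVLPSPL0
	.SVLPEND0:
		sub	base, base, adj

   so BASE drops in guard-sized steps with a probe after each step, and the
   final sub handles whatever residual is too small to need a probe.  */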
4442 /* Determine whether a frame chain needs to be generated. */
4443 static bool
4444 aarch64_needs_frame_chain (void)
4445 {
4446 /* Force a frame chain for EH returns so the return address is at FP+8. */
4447 if (frame_pointer_needed || crtl->calls_eh_return)
4448 return true;
4449
4450 /* A leaf function cannot have calls or write LR. */
4451 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4452
4453 /* Don't use a frame chain in leaf functions if leaf frame pointers
4454 are disabled. */
4455 if (flag_omit_leaf_frame_pointer && is_leaf)
4456 return false;
4457
4458 return aarch64_use_frame_pointer;
4459 }
4460
4461 /* Mark the registers that need to be saved by the callee and calculate
4462 the size of the callee-saved registers area and frame record (both FP
4463 and LR may be omitted). */
4464 static void
4465 aarch64_layout_frame (void)
4466 {
4467 HOST_WIDE_INT offset = 0;
4468 int regno, last_fp_reg = INVALID_REGNUM;
4469 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4470
4471 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4472
4473 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4474 the mid-end is doing. */
4475 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4476
4477 #define SLOT_NOT_REQUIRED (-2)
4478 #define SLOT_REQUIRED (-1)
4479
4480 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4481 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4482
4483 /* If this is a non-leaf simd function with calls we assume that
4484 at least one of those calls is to a non-simd function and thus
4485 we must save V8 to V23 in the prologue. */
4486
4487 if (simd_function && !crtl->is_leaf)
4488 {
4489 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4490 if (FP_SIMD_SAVED_REGNUM_P (regno))
4491 df_set_regs_ever_live (regno, true);
4492 }
4493
4494 /* First mark all the registers that really need to be saved... */
4495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4496 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4497
4498 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4499 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4500
4501 /* ... that includes the eh data registers (if needed)... */
4502 if (crtl->calls_eh_return)
4503 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4504 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4505 = SLOT_REQUIRED;
4506
4507 /* ... and any callee saved register that dataflow says is live. */
4508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4509 if (df_regs_ever_live_p (regno)
4510 && (regno == R30_REGNUM
4511 || !call_used_regs[regno]))
4512 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4513
4514 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4515 if (df_regs_ever_live_p (regno)
4516 && (!call_used_regs[regno]
4517 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4518 {
4519 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4520 last_fp_reg = regno;
4521 }
4522
4523 if (cfun->machine->frame.emit_frame_chain)
4524 {
4525 /* FP and LR are placed in the linkage record. */
4526 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4527 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4528 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4529 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4530 offset = 2 * UNITS_PER_WORD;
4531 }
4532
4533 /* With stack-clash, LR must be saved in non-leaf functions. */
4534 gcc_assert (crtl->is_leaf
4535 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4536 != SLOT_NOT_REQUIRED));
4537
4538 /* Now assign stack slots for them. */
4539 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4540 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4541 {
4542 cfun->machine->frame.reg_offset[regno] = offset;
4543 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4544 cfun->machine->frame.wb_candidate1 = regno;
4545 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4546 cfun->machine->frame.wb_candidate2 = regno;
4547 offset += UNITS_PER_WORD;
4548 }
4549
4550 HOST_WIDE_INT max_int_offset = offset;
4551 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4552 bool has_align_gap = offset != max_int_offset;
4553
4554 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4555 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4556 {
4557 /* If there is an alignment gap between integer and fp callee-saves,
4558 allocate the last fp register to it if possible. */
4559 if (regno == last_fp_reg
4560 && has_align_gap
4561 && !simd_function
4562 && (offset & 8) == 0)
4563 {
4564 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4565 break;
4566 }
4567
4568 cfun->machine->frame.reg_offset[regno] = offset;
4569 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4570 cfun->machine->frame.wb_candidate1 = regno;
4571 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4572 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4573 cfun->machine->frame.wb_candidate2 = regno;
4574 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4575 }
4576
4577 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4578
4579 cfun->machine->frame.saved_regs_size = offset;
4580
4581 HOST_WIDE_INT varargs_and_saved_regs_size
4582 = offset + cfun->machine->frame.saved_varargs_size;
4583
4584 cfun->machine->frame.hard_fp_offset
4585 = aligned_upper_bound (varargs_and_saved_regs_size
4586 + get_frame_size (),
4587 STACK_BOUNDARY / BITS_PER_UNIT);
4588
4589 /* Both these values are already aligned. */
4590 gcc_assert (multiple_p (crtl->outgoing_args_size,
4591 STACK_BOUNDARY / BITS_PER_UNIT));
4592 cfun->machine->frame.frame_size
4593 = (cfun->machine->frame.hard_fp_offset
4594 + crtl->outgoing_args_size);
4595
4596 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4597
4598 cfun->machine->frame.initial_adjust = 0;
4599 cfun->machine->frame.final_adjust = 0;
4600 cfun->machine->frame.callee_adjust = 0;
4601 cfun->machine->frame.callee_offset = 0;
4602
4603 HOST_WIDE_INT max_push_offset = 0;
4604 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4605 max_push_offset = 512;
4606 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4607 max_push_offset = 256;
4608
4609 HOST_WIDE_INT const_size, const_fp_offset;
4610 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4611 && const_size < max_push_offset
4612 && known_eq (crtl->outgoing_args_size, 0))
4613 {
4614 /* Simple, small frame with no outgoing arguments:
4615 stp reg1, reg2, [sp, -frame_size]!
4616 stp reg3, reg4, [sp, 16] */
4617 cfun->machine->frame.callee_adjust = const_size;
4618 }
4619 else if (known_lt (crtl->outgoing_args_size
4620 + cfun->machine->frame.saved_regs_size, 512)
4621 && !(cfun->calls_alloca
4622 && known_lt (cfun->machine->frame.hard_fp_offset,
4623 max_push_offset)))
4624 {
4625 /* Frame with small outgoing arguments:
4626 sub sp, sp, frame_size
4627 stp reg1, reg2, [sp, outgoing_args_size]
4628 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4629 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4630 cfun->machine->frame.callee_offset
4631 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4632 }
4633 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4634 && const_fp_offset < max_push_offset)
4635 {
4636 /* Frame with large outgoing arguments but a small local area:
4637 stp reg1, reg2, [sp, -hard_fp_offset]!
4638 stp reg3, reg4, [sp, 16]
4639 sub sp, sp, outgoing_args_size */
4640 cfun->machine->frame.callee_adjust = const_fp_offset;
4641 cfun->machine->frame.final_adjust
4642 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4643 }
4644 else
4645 {
4646 /* Frame with large local area and outgoing arguments using frame pointer:
4647 sub sp, sp, hard_fp_offset
4648 stp x29, x30, [sp, 0]
4649 add x29, sp, 0
4650 stp reg3, reg4, [sp, 16]
4651 sub sp, sp, outgoing_args_size */
4652 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4653 cfun->machine->frame.final_adjust
4654 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4655 }
4656
4657 cfun->machine->frame.laid_out = true;
4658 }
4659
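/* A small worked example of the first case above (a sketch; the actual
   code generation happens in the prologue expander, not here): a function
   that needs a frame chain, also saves x19/x20, has 16 bytes of locals and
   no outgoing arguments ends up with reg_offset[x29] = 0,
   reg_offset[x30] = 8, reg_offset[x19] = 16, reg_offset[x20] = 24,
   saved_regs_size = 32, hard_fp_offset = frame_size = 48 and
   callee_adjust = 48, matching the "stp reg1, reg2, [sp, -frame_size]!"
   shape sketched above.  */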
4660 /* Return true if the register REGNO is saved on entry to
4661 the current function. */
4662
4663 static bool
4664 aarch64_register_saved_on_entry (int regno)
4665 {
4666 return cfun->machine->frame.reg_offset[regno] >= 0;
4667 }
4668
4669 /* Return the next register, from REGNO up to LIMIT, that the callee
4670 needs to save. */
4671
4672 static unsigned
4673 aarch64_next_callee_save (unsigned regno, unsigned limit)
4674 {
4675 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4676 regno ++;
4677 return regno;
4678 }
4679
4680 /* Push the register number REGNO of mode MODE to the stack with write-back
4681 adjusting the stack by ADJUSTMENT. */
4682
4683 static void
4684 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4685 HOST_WIDE_INT adjustment)
4686 {
4687 rtx base_rtx = stack_pointer_rtx;
4688 rtx insn, reg, mem;
4689
4690 reg = gen_rtx_REG (mode, regno);
4691 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4692 plus_constant (Pmode, base_rtx, -adjustment));
4693 mem = gen_frame_mem (mode, mem);
4694
4695 insn = emit_move_insn (mem, reg);
4696 RTX_FRAME_RELATED_P (insn) = 1;
4697 }
4698
4699 /* Generate and return an instruction to store the pair of registers
4700 REG and REG2 of mode MODE to location BASE with write-back adjusting
4701 the stack location BASE by ADJUSTMENT. */
4702
4703 static rtx
4704 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4705 HOST_WIDE_INT adjustment)
4706 {
4707 switch (mode)
4708 {
4709 case E_DImode:
4710 return gen_storewb_pairdi_di (base, base, reg, reg2,
4711 GEN_INT (-adjustment),
4712 GEN_INT (UNITS_PER_WORD - adjustment));
4713 case E_DFmode:
4714 return gen_storewb_pairdf_di (base, base, reg, reg2,
4715 GEN_INT (-adjustment),
4716 GEN_INT (UNITS_PER_WORD - adjustment));
4717 case E_TFmode:
4718 return gen_storewb_pairtf_di (base, base, reg, reg2,
4719 GEN_INT (-adjustment),
4720 GEN_INT (UNITS_PER_VREG - adjustment));
4721 default:
4722 gcc_unreachable ();
4723 }
4724 }
4725
4726 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4727 stack pointer by ADJUSTMENT. */
4728
4729 static void
4730 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4731 {
4732 rtx_insn *insn;
4733 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4734
4735 if (regno2 == INVALID_REGNUM)
4736 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4737
4738 rtx reg1 = gen_rtx_REG (mode, regno1);
4739 rtx reg2 = gen_rtx_REG (mode, regno2);
4740
4741 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4742 reg2, adjustment));
4743 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4744 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4745 RTX_FRAME_RELATED_P (insn) = 1;
4746 }
4747
4748 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4749 adjusting it by ADJUSTMENT afterwards. */
4750
4751 static rtx
4752 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4753 HOST_WIDE_INT adjustment)
4754 {
4755 switch (mode)
4756 {
4757 case E_DImode:
4758 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4759 GEN_INT (UNITS_PER_WORD));
4760 case E_DFmode:
4761 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4762 GEN_INT (UNITS_PER_WORD));
4763 case E_TFmode:
4764 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4765 GEN_INT (UNITS_PER_VREG));
4766 default:
4767 gcc_unreachable ();
4768 }
4769 }
4770
4771 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4772 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4773 into CFI_OPS. */
4774
4775 static void
4776 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4777 rtx *cfi_ops)
4778 {
4779 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4780 rtx reg1 = gen_rtx_REG (mode, regno1);
4781
4782 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4783
4784 if (regno2 == INVALID_REGNUM)
4785 {
4786 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4787 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4788 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4789 }
4790 else
4791 {
4792 rtx reg2 = gen_rtx_REG (mode, regno2);
4793 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4794 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4795 reg2, adjustment));
4796 }
4797 }
4798
4799 /* Generate and return a store pair instruction of mode MODE to store
4800 register REG1 to MEM1 and register REG2 to MEM2. */
4801
4802 static rtx
4803 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4804 rtx reg2)
4805 {
4806 switch (mode)
4807 {
4808 case E_DImode:
4809 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4810
4811 case E_DFmode:
4812 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4813
4814 case E_TFmode:
4815 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4816
4817 default:
4818 gcc_unreachable ();
4819 }
4820 }
4821
4822 /* Generate and return a load pair instruction of mode MODE to load register
4823 REG1 from MEM1 and register REG2 from MEM2. */
4824
4825 static rtx
4826 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4827 rtx mem2)
4828 {
4829 switch (mode)
4830 {
4831 case E_DImode:
4832 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4833
4834 case E_DFmode:
4835 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4836
4837 case E_TFmode:
4838 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4839
4840 default:
4841 gcc_unreachable ();
4842 }
4843 }
4844
4845 /* Return TRUE if return address signing should be enabled for the current
4846 function, otherwise return FALSE. */
4847
4848 bool
4849 aarch64_return_address_signing_enabled (void)
4850 {
4851 /* This function should only be called after the frame has been laid out. */
4852 gcc_assert (cfun->machine->frame.laid_out);
4853
4854 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4855 if its LR is pushed onto the stack. */
4856 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4857 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4858 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4859 }
4860
4861 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4862 bool
4863 aarch64_bti_enabled (void)
4864 {
4865 return (aarch64_enable_bti == 1);
4866 }
4867
4868 /* Emit code to save the callee-saved registers from register number START
4869 to LIMIT to the stack at the location starting at offset START_OFFSET,
4870 skipping any write-back candidates if SKIP_WB is true. */
4871
4872 static void
4873 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4874 unsigned start, unsigned limit, bool skip_wb)
4875 {
4876 rtx_insn *insn;
4877 unsigned regno;
4878 unsigned regno2;
4879
4880 for (regno = aarch64_next_callee_save (start, limit);
4881 regno <= limit;
4882 regno = aarch64_next_callee_save (regno + 1, limit))
4883 {
4884 rtx reg, mem;
4885 poly_int64 offset;
4886 int offset_diff;
4887
4888 if (skip_wb
4889 && (regno == cfun->machine->frame.wb_candidate1
4890 || regno == cfun->machine->frame.wb_candidate2))
4891 continue;
4892
4893 if (cfun->machine->reg_is_wrapped_separately[regno])
4894 continue;
4895
4896 reg = gen_rtx_REG (mode, regno);
4897 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4898 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4899 offset));
4900
4901 regno2 = aarch64_next_callee_save (regno + 1, limit);
4902 offset_diff = cfun->machine->frame.reg_offset[regno2]
4903 - cfun->machine->frame.reg_offset[regno];
4904
4905 if (regno2 <= limit
4906 && !cfun->machine->reg_is_wrapped_separately[regno2]
4907 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4908 {
4909 rtx reg2 = gen_rtx_REG (mode, regno2);
4910 rtx mem2;
4911
4912 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4913 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4914 offset));
4915 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4916 reg2));
4917
4918 /* The first part of a frame-related parallel insn is
4919 always assumed to be relevant to the frame
4920 calculations; subsequent parts are only
4921 frame-related if explicitly marked. */
4922 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4923 regno = regno2;
4924 }
4925 else
4926 insn = emit_move_insn (mem, reg);
4927
4928 RTX_FRAME_RELATED_P (insn) = 1;
4929 }
4930 }
4931
4932 /* Emit code to restore the callee registers of mode MODE from register
4933 number START up to and including LIMIT. Restore from the stack offset
4934 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4935 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4936
4937 static void
4938 aarch64_restore_callee_saves (machine_mode mode,
4939 poly_int64 start_offset, unsigned start,
4940 unsigned limit, bool skip_wb, rtx *cfi_ops)
4941 {
4942 rtx base_rtx = stack_pointer_rtx;
4943 unsigned regno;
4944 unsigned regno2;
4945 poly_int64 offset;
4946
4947 for (regno = aarch64_next_callee_save (start, limit);
4948 regno <= limit;
4949 regno = aarch64_next_callee_save (regno + 1, limit))
4950 {
4951 if (cfun->machine->reg_is_wrapped_separately[regno])
4952 continue;
4953
4954 rtx reg, mem;
4955 int offset_diff;
4956
4957 if (skip_wb
4958 && (regno == cfun->machine->frame.wb_candidate1
4959 || regno == cfun->machine->frame.wb_candidate2))
4960 continue;
4961
4962 reg = gen_rtx_REG (mode, regno);
4963 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4964 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4965
4966 regno2 = aarch64_next_callee_save (regno + 1, limit);
4967 offset_diff = cfun->machine->frame.reg_offset[regno2]
4968 - cfun->machine->frame.reg_offset[regno];
4969
4970 if (regno2 <= limit
4971 && !cfun->machine->reg_is_wrapped_separately[regno2]
4972 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4973 {
4974 rtx reg2 = gen_rtx_REG (mode, regno2);
4975 rtx mem2;
4976
4977 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4978 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4979 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4980
4981 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4982 regno = regno2;
4983 }
4984 else
4985 emit_move_insn (reg, mem);
4986 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4987 }
4988 }
4989
4990 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4991 of MODE. */
4992
4993 static inline bool
4994 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4995 {
4996 HOST_WIDE_INT multiple;
4997 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4998 && IN_RANGE (multiple, -8, 7));
4999 }
5000
5001 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5002 of MODE. */
5003
5004 static inline bool
5005 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5006 {
5007 HOST_WIDE_INT multiple;
5008 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5009 && IN_RANGE (multiple, 0, 63));
5010 }
5011
5012 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5013 of MODE. */
5014
5015 bool
5016 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5017 {
5018 HOST_WIDE_INT multiple;
5019 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5020 && IN_RANGE (multiple, -64, 63));
5021 }
5022
5023 /* Return true if OFFSET is a signed 9-bit value. */
5024
5025 bool
5026 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5027 poly_int64 offset)
5028 {
5029 HOST_WIDE_INT const_offset;
5030 return (offset.is_constant (&const_offset)
5031 && IN_RANGE (const_offset, -256, 255));
5032 }
5033
5034 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5035 of MODE. */
5036
5037 static inline bool
5038 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5039 {
5040 HOST_WIDE_INT multiple;
5041 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5042 && IN_RANGE (multiple, -256, 255));
5043 }
5044
5045 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5046 of MODE. */
5047
5048 static inline bool
5049 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5050 {
5051 HOST_WIDE_INT multiple;
5052 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5053 && IN_RANGE (multiple, 0, 4095));
5054 }
5055
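/* To make the ranges above concrete (simple arithmetic, taking DImode and
   its 8-byte size as the example):

     offset_4bit_signed_scaled_p           accepts   -64 ... 56    (step 8)
     offset_6bit_unsigned_scaled_p         accepts     0 ... 504   (step 8)
     aarch64_offset_7bit_signed_scaled_p   accepts  -512 ... 504   (step 8)
     aarch64_offset_9bit_signed_unscaled_p accepts  -256 ... 255   (any byte)
     offset_9bit_signed_scaled_p           accepts -2048 ... 2040  (step 8)
     offset_12bit_unsigned_scaled_p        accepts     0 ... 32760 (step 8)  */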
5056 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5057
5058 static sbitmap
5059 aarch64_get_separate_components (void)
5060 {
5061 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5062 bitmap_clear (components);
5063
5064 /* The registers we need saved to the frame. */
5065 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5066 if (aarch64_register_saved_on_entry (regno))
5067 {
5068 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5069 if (!frame_pointer_needed)
5070 offset += cfun->machine->frame.frame_size
5071 - cfun->machine->frame.hard_fp_offset;
5072 /* Check that we can access the stack slot of the register with one
5073 direct load with no adjustments needed. */
5074 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5075 bitmap_set_bit (components, regno);
5076 }
5077
5078 /* Don't mess with the hard frame pointer. */
5079 if (frame_pointer_needed)
5080 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5081
5082 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5083 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5084 /* If registers have been chosen to be stored/restored with
5085 writeback don't interfere with them to avoid having to output explicit
5086 stack adjustment instructions. */
5087 if (reg2 != INVALID_REGNUM)
5088 bitmap_clear_bit (components, reg2);
5089 if (reg1 != INVALID_REGNUM)
5090 bitmap_clear_bit (components, reg1);
5091
5092 bitmap_clear_bit (components, LR_REGNUM);
5093 bitmap_clear_bit (components, SP_REGNUM);
5094
5095 return components;
5096 }
5097
5098 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5099
5100 static sbitmap
5101 aarch64_components_for_bb (basic_block bb)
5102 {
5103 bitmap in = DF_LIVE_IN (bb);
5104 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5105 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5106 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5107
5108 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5109 bitmap_clear (components);
5110
5111 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5112 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5113 if ((!call_used_regs[regno]
5114 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5115 && (bitmap_bit_p (in, regno)
5116 || bitmap_bit_p (gen, regno)
5117 || bitmap_bit_p (kill, regno)))
5118 {
5119 unsigned regno2, offset, offset2;
5120 bitmap_set_bit (components, regno);
5121
5122 /* If there is a callee-save at an adjacent offset, add it as well
5123 to increase the use of LDP/STP. */
5124 offset = cfun->machine->frame.reg_offset[regno];
5125 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5126
5127 if (regno2 <= LAST_SAVED_REGNUM)
5128 {
5129 offset2 = cfun->machine->frame.reg_offset[regno2];
5130 if ((offset & ~8) == (offset2 & ~8))
5131 bitmap_set_bit (components, regno2);
5132 }
5133 }
5134
5135 return components;
5136 }
5137
5138 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5139 Nothing to do for aarch64. */
5140
5141 static void
5142 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5143 {
5144 }
5145
5146 /* Return the next set bit in BMP from START onwards. Return the total number
5147 of bits in BMP if no set bit is found at or after START. */
5148
5149 static unsigned int
5150 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5151 {
5152 unsigned int nbits = SBITMAP_SIZE (bmp);
5153 if (start == nbits)
5154 return start;
5155
5156 gcc_assert (start < nbits);
5157 for (unsigned int i = start; i < nbits; i++)
5158 if (bitmap_bit_p (bmp, i))
5159 return i;
5160
5161 return nbits;
5162 }
5163
5164 /* Do the work for aarch64_emit_prologue_components and
5165 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5166 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5167 for these components or the epilogue sequence. That is, it determines
5168 whether we should emit stores or loads and what kind of CFA notes to attach
5169 to the insns. Otherwise the logic for the two sequences is very
5170 similar. */
5171
5172 static void
5173 aarch64_process_components (sbitmap components, bool prologue_p)
5174 {
5175 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5176 ? HARD_FRAME_POINTER_REGNUM
5177 : STACK_POINTER_REGNUM);
5178
5179 unsigned last_regno = SBITMAP_SIZE (components);
5180 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5181 rtx_insn *insn = NULL;
5182
5183 while (regno != last_regno)
5184 {
5185 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved,
5186 so DFmode for the vector registers is enough. For simd functions
5187 we want to save the low 128 bits. */
5188 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5189
5190 rtx reg = gen_rtx_REG (mode, regno);
5191 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5192 if (!frame_pointer_needed)
5193 offset += cfun->machine->frame.frame_size
5194 - cfun->machine->frame.hard_fp_offset;
5195 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5196 rtx mem = gen_frame_mem (mode, addr);
5197
5198 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5199 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5200 /* No more registers to handle after REGNO.
5201 Emit a single save/restore and exit. */
5202 if (regno2 == last_regno)
5203 {
5204 insn = emit_insn (set);
5205 RTX_FRAME_RELATED_P (insn) = 1;
5206 if (prologue_p)
5207 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5208 else
5209 add_reg_note (insn, REG_CFA_RESTORE, reg);
5210 break;
5211 }
5212
5213 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5214 /* The next register is not of the same class or its offset is not
5215 mergeable with the current one into a pair. */
5216 if (!satisfies_constraint_Ump (mem)
5217 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5218 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5219 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5220 GET_MODE_SIZE (mode)))
5221 {
5222 insn = emit_insn (set);
5223 RTX_FRAME_RELATED_P (insn) = 1;
5224 if (prologue_p)
5225 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5226 else
5227 add_reg_note (insn, REG_CFA_RESTORE, reg);
5228
5229 regno = regno2;
5230 continue;
5231 }
5232
5233 /* REGNO2 can be saved/restored in a pair with REGNO. */
5234 rtx reg2 = gen_rtx_REG (mode, regno2);
5235 if (!frame_pointer_needed)
5236 offset2 += cfun->machine->frame.frame_size
5237 - cfun->machine->frame.hard_fp_offset;
5238 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5239 rtx mem2 = gen_frame_mem (mode, addr2);
5240 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5241 : gen_rtx_SET (reg2, mem2);
5242
5243 if (prologue_p)
5244 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5245 else
5246 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5247
5248 RTX_FRAME_RELATED_P (insn) = 1;
5249 if (prologue_p)
5250 {
5251 add_reg_note (insn, REG_CFA_OFFSET, set);
5252 add_reg_note (insn, REG_CFA_OFFSET, set2);
5253 }
5254 else
5255 {
5256 add_reg_note (insn, REG_CFA_RESTORE, reg);
5257 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5258 }
5259
5260 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5261 }
5262 }
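/* Illustrative note (added for exposition, not from the original sources):
   if COMPONENTS selects, say, x22 and x23 and their frame slots sit at
   adjacent 8-byte offsets, the loop above merges them into a single STP in
   the prologue or LDP in the epilogue; if the next set register is of a
   different class or its slot is not adjacent, the code falls back to a
   single store or load and continues from that register.  */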
5263
5264 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5265
5266 static void
5267 aarch64_emit_prologue_components (sbitmap components)
5268 {
5269 aarch64_process_components (components, true);
5270 }
5271
5272 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5273
5274 static void
5275 aarch64_emit_epilogue_components (sbitmap components)
5276 {
5277 aarch64_process_components (components, false);
5278 }
5279
5280 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5281
5282 static void
5283 aarch64_set_handled_components (sbitmap components)
5284 {
5285 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5286 if (bitmap_bit_p (components, regno))
5287 cfun->machine->reg_is_wrapped_separately[regno] = true;
5288 }
5289
5290 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5291 determine the probe offset for alloca. */
5292
5293 static HOST_WIDE_INT
5294 aarch64_stack_clash_protection_alloca_probe_range (void)
5295 {
5296 return STACK_CLASH_CALLER_GUARD;
5297 }
5298
5299
5300 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5301 registers. If POLY_SIZE is not large enough to require a probe, this
5302 function will only adjust the stack. When allocating the stack space,
5303 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5304 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5305 arguments. If we are, we ensure that any allocation larger than the
5306 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer
5307 is maintained.
5308
5309 We emit barriers after each stack adjustment to prevent optimizations from
5310 breaking the invariant that we never drop the stack by more than a page.
5311 This invariant makes it easier to handle asynchronous events correctly:
5312 if we were allowed to drop the stack by more than a page and emit the
5313 probes for it afterwards, and a signal arrived in between, the signal
5314 handler would not know the state of the stack and could make no
5315 assumptions about which pages had been probed. */
5316
5317 static void
5318 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5319 poly_int64 poly_size,
5320 bool frame_related_p,
5321 bool final_adjustment_p)
5322 {
5323 HOST_WIDE_INT guard_size
5324 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5325 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5326 /* When doing the final adjustment for the outgoing argument size we can't
5327 assume that LR was saved at position 0. So subtract its offset from the
5328 ABI safe buffer so that we don't accidentally allow an adjustment that
5329 would result in an allocation larger than the ABI buffer without
5330 probing. */
5331 HOST_WIDE_INT min_probe_threshold
5332 = final_adjustment_p
5333 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5334 : guard_size - guard_used_by_caller;
5335
5336 poly_int64 frame_size = cfun->machine->frame.frame_size;
5337
5338 /* We should always have a positive probe threshold. */
5339 gcc_assert (min_probe_threshold > 0);
5340
5341 if (flag_stack_clash_protection && !final_adjustment_p)
5342 {
5343 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5344 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5345
5346 if (known_eq (frame_size, 0))
5347 {
5348 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5349 }
5350 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5351 && known_lt (final_adjust, guard_used_by_caller))
5352 {
5353 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5354 }
5355 }
5356
5357 /* If SIZE is not large enough to require probing, just adjust the stack and
5358 exit. */
5359 if (known_lt (poly_size, min_probe_threshold)
5360 || !flag_stack_clash_protection)
5361 {
5362 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5363 return;
5364 }
5365
5366 HOST_WIDE_INT size;
5367 /* Handle the SVE non-constant case first. */
5368 if (!poly_size.is_constant (&size))
5369 {
5370 if (dump_file)
5371 {
5372 fprintf (dump_file, "Stack clash SVE prologue: ");
5373 print_dec (poly_size, dump_file);
5374 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5375 }
5376
5377 /* First calculate the number of bytes we're actually spilling. */
5378 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5379 poly_size, temp1, temp2, false, true);
5380
5381 rtx_insn *insn = get_last_insn ();
5382
5383 if (frame_related_p)
5384 {
5385 /* This is done to provide unwinding information for the stack
5386 adjustments we're about to do; however, to prevent the optimizers
5387 from removing the R11 move and leaving the CFA note (which would be
5388 very wrong), we tie the old and new stack pointers together.
5389 The tie will expand to nothing, but the optimizers will not touch
5390 the instruction. */
5391 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5392 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5393 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5394
5395 /* We want the CFA independent of the stack pointer for the
5396 duration of the loop. */
5397 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5398 RTX_FRAME_RELATED_P (insn) = 1;
5399 }
5400
5401 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5402 rtx guard_const = gen_int_mode (guard_size, Pmode);
5403
5404 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5405 stack_pointer_rtx, temp1,
5406 probe_const, guard_const));
5407
5408 /* Now reset the CFA register if needed. */
5409 if (frame_related_p)
5410 {
5411 add_reg_note (insn, REG_CFA_DEF_CFA,
5412 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5413 gen_int_mode (poly_size, Pmode)));
5414 RTX_FRAME_RELATED_P (insn) = 1;
5415 }
5416
5417 return;
5418 }
5419
5420 if (dump_file)
5421 fprintf (dump_file,
5422 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5423 " bytes, probing will be required.\n", size);
5424
5425 /* Round SIZE down to a multiple of GUARD_SIZE, and calculate the
5426 residual as the difference between the original size and the rounded
5427 size. */
5428 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5429 HOST_WIDE_INT residual = size - rounded_size;
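/* Worked example with hypothetical numbers: for SIZE == 150000 and a
   64KB (65536-byte) guard, ROUNDED_SIZE == 131072 (two full guard-size
   chunks) and RESIDUAL == 18928.  */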
5430
5431 /* We can handle a small number of allocations/probes inline. Otherwise
5432 punt to a loop. */
5433 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5434 {
5435 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5436 {
5437 aarch64_sub_sp (NULL, temp2, guard_size, true);
5438 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5439 guard_used_by_caller));
5440 emit_insn (gen_blockage ());
5441 }
5442 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5443 }
5444 else
5445 {
5446 /* Compute the ending address. */
5447 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5448 temp1, NULL, false, true);
5449 rtx_insn *insn = get_last_insn ();
5450
5451 /* For the initial allocation, we don't have a frame pointer
5452 set up, so we always need CFI notes. If we're doing the
5453 final allocation, then we may have a frame pointer, in which
5454 case it is the CFA, otherwise we need CFI notes.
5455
5456 We can determine which allocation we are doing by looking at
5457 the value of FRAME_RELATED_P since the final allocations are not
5458 frame related. */
5459 if (frame_related_p)
5460 {
5461 /* We want the CFA independent of the stack pointer for the
5462 duration of the loop. */
5463 add_reg_note (insn, REG_CFA_DEF_CFA,
5464 plus_constant (Pmode, temp1, rounded_size));
5465 RTX_FRAME_RELATED_P (insn) = 1;
5466 }
5467
5468 /* This allocates and probes the stack. Note that this re-uses some of
5469 the existing Ada stack protection code. However, we are guaranteed not
5470 to enter the non-loop or residual branches of that code.
5471
5472 The non-loop part won't be entered because if our allocation amount
5473 doesn't require a loop, the case above would handle it.
5474
5475 The residual branch won't be entered because TEMP1 is a multiple of
5476 the allocation size. The residual will always be 0. As such, the only
5477 part we are actually using from that code is the loop setup. The
5478 actual probing is done in aarch64_output_probe_stack_range. */
5479 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5480 stack_pointer_rtx, temp1));
5481
5482 /* Now reset the CFA register if needed. */
5483 if (frame_related_p)
5484 {
5485 add_reg_note (insn, REG_CFA_DEF_CFA,
5486 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5487 RTX_FRAME_RELATED_P (insn) = 1;
5488 }
5489
5490 emit_insn (gen_blockage ());
5491 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5492 }
5493
5494 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5495 be probed. This maintains the requirement that each page is probed at
5496 least once. For initial probing we probe only if the allocation is
5497 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5498 if the amount is larger than buffer; GUARD_SIZE - buffer + buffer ==
5499 GUARD_SIZE. This works because any allocation that is large enough to
5500 trigger a probe here gets at least one, and any allocation that is not
5501 large enough for this code to emit anything will have had its page
5502 probed by the saving of FP/LR, either by this function or by a callee.
5503 If we don't have any callees then we won't have more stack adjustments
5504 and so are still safe. */
5505 if (residual)
5506 {
5507 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5508 /* If we're doing final adjustments, and we've done any full page
5509 allocations then any residual needs to be probed. */
5510 if (final_adjustment_p && rounded_size != 0)
5511 min_probe_threshold = 0;
5512 /* If doing a small final adjustment, we always probe at offset 0.
5513 This is done to avoid issues when LR is not at position 0 or when
5514 the final adjustment is smaller than the probing offset. */
5515 else if (final_adjustment_p && rounded_size == 0)
5516 residual_probe_offset = 0;
5517
5518 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5519 if (residual >= min_probe_threshold)
5520 {
5521 if (dump_file)
5522 fprintf (dump_file,
5523 "Stack clash AArch64 prologue residuals: "
5524 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5525 "\n", residual);
5526
5527 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5528 residual_probe_offset));
5529 emit_insn (gen_blockage ());
5530 }
5531 }
5532 }
5533
5534 /* Return 1 if the register is used by the epilogue. We need to say the
5535 return register is used, but only after epilogue generation is complete.
5536 Note that in the case of sibcalls, the values "used by the epilogue" are
5537 considered live at the start of the called function.
5538
5539 For SIMD functions we need to return 1 for FP registers that are saved and
5540 restored by a function but are not zero in call_used_regs. If we do not do
5541 this, optimizations may remove the restore of the register. */
5542
5543 int
5544 aarch64_epilogue_uses (int regno)
5545 {
5546 if (epilogue_completed)
5547 {
5548 if (regno == LR_REGNUM)
5549 return 1;
5550 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5551 return 1;
5552 }
5553 return 0;
5554 }
5555
5556 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5557 is saved at BASE + OFFSET. */
5558
5559 static void
5560 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5561 rtx base, poly_int64 offset)
5562 {
5563 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5564 add_reg_note (insn, REG_CFA_EXPRESSION,
5565 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5566 }
5567
5568 /* AArch64 stack frames generated by this compiler look like:
5569
5570 +-------------------------------+
5571 | |
5572 | incoming stack arguments |
5573 | |
5574 +-------------------------------+
5575 | | <-- incoming stack pointer (aligned)
5576 | callee-allocated save area |
5577 | for register varargs |
5578 | |
5579 +-------------------------------+
5580 | local variables | <-- frame_pointer_rtx
5581 | |
5582 +-------------------------------+
5583 | padding | \
5584 +-------------------------------+ |
5585 | callee-saved registers | | frame.saved_regs_size
5586 +-------------------------------+ |
5587 | LR' | |
5588 +-------------------------------+ |
5589 | FP' | / <- hard_frame_pointer_rtx (aligned)
5590 +-------------------------------+
5591 | dynamic allocation |
5592 +-------------------------------+
5593 | padding |
5594 +-------------------------------+
5595 | outgoing stack arguments | <-- arg_pointer
5596 | |
5597 +-------------------------------+
5598 | | <-- stack_pointer_rtx (aligned)
5599
5600 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5601 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5602 unchanged.
5603
5604 By default for stack-clash we assume the guard is at least 64KB, but this
5605 value is configurable to either 4KB or 64KB. We also force the guard size to
5606 be the same as the probing interval and both values are kept in sync.
5607
5608 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5609 on the guard size) of stack space without probing.
5610
5611 When probing is needed, we emit a probe at the start of the prologue
5612 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5613
5614 We have to track how much space has been allocated; the only stores
5615 to the stack that we track as implicit probes are the FP/LR stores.
5616
5617 For outgoing arguments we probe if the size is larger than 1KB, such that
5618 the ABI specified buffer is maintained for the next callee.
5619
5620 The following registers are reserved during frame layout and should not be
5621 used for any other purpose:
5622
5623 - r11: Used by stack clash protection when SVE is enabled.
5624 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5625 - r14 and r15: Used for speculation tracking.
5626 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5627 - r30(LR), r29(FP): Used by standard frame layout.
5628
5629 These registers must be avoided in frame layout related code unless the
5630 explicit intention is to interact with one of the features listed above. */
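/* Worked example of the probing thresholds above (default parameters
   assumed): with a 64KB guard and the 1KB ABI-defined buffer, an initial
   allocation of up to 65536 - 1024 = 64512 bytes (63KB) needs no probe,
   while an outgoing-argument allocation is probed as soon as it exceeds
   1024 bytes.  */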
5631
5632 /* Generate the prologue instructions for entry into a function.
5633 Establish the stack frame by decreasing the stack pointer with a
5634 properly calculated size and, if necessary, create a frame record
5635 filled with the values of LR and previous frame pointer. The
5636 current FP is also set up if it is in use. */
5637
5638 void
5639 aarch64_expand_prologue (void)
5640 {
5641 poly_int64 frame_size = cfun->machine->frame.frame_size;
5642 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5643 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5644 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5645 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5646 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5647 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5648 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5649 rtx_insn *insn;
5650
5651 /* Sign return address for functions. */
5652 if (aarch64_return_address_signing_enabled ())
5653 {
5654 insn = emit_insn (gen_pacisp ());
5655 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5656 RTX_FRAME_RELATED_P (insn) = 1;
5657 }
5658
5659 if (flag_stack_usage_info)
5660 current_function_static_stack_size = constant_lower_bound (frame_size);
5661
5662 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5663 {
5664 if (crtl->is_leaf && !cfun->calls_alloca)
5665 {
5666 if (maybe_gt (frame_size, PROBE_INTERVAL)
5667 && maybe_gt (frame_size, get_stack_check_protect ()))
5668 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5669 (frame_size
5670 - get_stack_check_protect ()));
5671 }
5672 else if (maybe_gt (frame_size, 0))
5673 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5674 }
5675
5676 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5677 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5678
5679 /* In theory we should never have both an initial adjustment
5680 and a callee save adjustment. Verify that is the case since the
5681 code below does not handle it for -fstack-clash-protection. */
5682 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5683
5684 /* Will only probe if the initial adjustment is larger than the guard
5685 less the amount of the guard reserved for use by the caller's
5686 outgoing args. */
5687 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5688 true, false);
5689
5690 if (callee_adjust != 0)
5691 aarch64_push_regs (reg1, reg2, callee_adjust);
5692
5693 if (emit_frame_chain)
5694 {
5695 poly_int64 reg_offset = callee_adjust;
5696 if (callee_adjust == 0)
5697 {
5698 reg1 = R29_REGNUM;
5699 reg2 = R30_REGNUM;
5700 reg_offset = callee_offset;
5701 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5702 }
5703 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5704 stack_pointer_rtx, callee_offset,
5705 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5706 if (frame_pointer_needed && !frame_size.is_constant ())
5707 {
5708 /* Variable-sized frames need to describe the save slot
5709 address using DW_CFA_expression rather than DW_CFA_offset.
5710 This means that, without taking further action, the
5711 locations of the registers that we've already saved would
5712 remain based on the stack pointer even after we redefine
5713 the CFA based on the frame pointer. We therefore need new
5714 DW_CFA_expressions to re-express the save slots with addresses
5715 based on the frame pointer. */
5716 rtx_insn *insn = get_last_insn ();
5717 gcc_assert (RTX_FRAME_RELATED_P (insn));
5718
5719 /* Add an explicit CFA definition if this was previously
5720 implicit. */
5721 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5722 {
5723 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5724 callee_offset);
5725 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5726 gen_rtx_SET (hard_frame_pointer_rtx, src));
5727 }
5728
5729 /* Change the save slot expressions for the registers that
5730 we've already saved. */
5731 reg_offset -= callee_offset;
5732 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5733 reg_offset + UNITS_PER_WORD);
5734 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5735 reg_offset);
5736 }
5737 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5738 }
5739
5740 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5741 callee_adjust != 0 || emit_frame_chain);
5742 if (aarch64_simd_decl_p (cfun->decl))
5743 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5744 callee_adjust != 0 || emit_frame_chain);
5745 else
5746 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5747 callee_adjust != 0 || emit_frame_chain);
5748
5749 /* We may need to probe the final adjustment if it is larger than the guard
5750 that is assumed by the callee. */
5751 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5752 !frame_pointer_needed, true);
5753 }
5754
5755 /* Return TRUE if we can use a simple_return insn.
5756
5757 This function checks whether the callee-saved stack is empty, which
5758 means no restore actions are needed. The pro_and_epilogue pass will use
5759 this to check whether the shrink-wrapping optimization is feasible. */
5760
5761 bool
5762 aarch64_use_return_insn_p (void)
5763 {
5764 if (!reload_completed)
5765 return false;
5766
5767 if (crtl->profile)
5768 return false;
5769
5770 return known_eq (cfun->machine->frame.frame_size, 0);
5771 }
5772
5773 /* Return false for non-leaf SIMD functions in order to avoid
5774 shrink-wrapping them, since doing so would lose the necessary
5775 save/restore of FP registers. */
5776
5777 bool
5778 aarch64_use_simple_return_insn_p (void)
5779 {
5780 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5781 return false;
5782
5783 return true;
5784 }
5785
5786 /* Generate the epilogue instructions for returning from a function.
5787 This is almost exactly the reverse of the prolog sequence, except
5788 that we need to insert barriers to avoid scheduling loads that read
5789 from a deallocated stack, and we optimize the unwind records by
5790 emitting them all together if possible. */
5791 void
5792 aarch64_expand_epilogue (bool for_sibcall)
5793 {
5794 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5795 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5796 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5797 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5798 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5799 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5800 rtx cfi_ops = NULL;
5801 rtx_insn *insn;
5802 /* A stack clash protection prologue may not have left EP0_REGNUM or
5803 EP1_REGNUM in a usable state. The same is true for allocations
5804 with an SVE component, since we then need both temporary registers
5805 for each allocation. For stack clash we are in a usable state if
5806 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5807 HOST_WIDE_INT guard_size
5808 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5809 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5810
5811 /* We can re-use the registers when the allocation amount is smaller than
5812 guard_size - guard_used_by_caller because we won't be doing any probes
5813 then. In such situations the register should remain live with the correct
5814 value. */
5815 bool can_inherit_p = (initial_adjust.is_constant ()
5816 && final_adjust.is_constant ())
5817 && (!flag_stack_clash_protection
5818 || known_lt (initial_adjust,
5819 guard_size - guard_used_by_caller));
5820
5821 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5822 bool need_barrier_p
5823 = maybe_ne (get_frame_size ()
5824 + cfun->machine->frame.saved_varargs_size, 0);
5825
5826 /* Emit a barrier to prevent loads from a deallocated stack. */
5827 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5828 || cfun->calls_alloca
5829 || crtl->calls_eh_return)
5830 {
5831 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5832 need_barrier_p = false;
5833 }
5834
5835 /* Restore the stack pointer from the frame pointer if it may not
5836 be the same as the stack pointer. */
5837 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5838 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5839 if (frame_pointer_needed
5840 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5841 /* If writeback is used when restoring callee-saves, the CFA
5842 is restored on the instruction doing the writeback. */
5843 aarch64_add_offset (Pmode, stack_pointer_rtx,
5844 hard_frame_pointer_rtx, -callee_offset,
5845 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5846 else
5847 /* The case where we need to re-use the register here is very rare, so
5848 avoid the complicated condition and just always emit a move if the
5849 immediate doesn't fit. */
5850 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5851
5852 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5853 callee_adjust != 0, &cfi_ops);
5854 if (aarch64_simd_decl_p (cfun->decl))
5855 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5856 callee_adjust != 0, &cfi_ops);
5857 else
5858 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5859 callee_adjust != 0, &cfi_ops);
5860
5861 if (need_barrier_p)
5862 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5863
5864 if (callee_adjust != 0)
5865 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5866
5867 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5868 {
5869 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5870 insn = get_last_insn ();
5871 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5872 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5873 RTX_FRAME_RELATED_P (insn) = 1;
5874 cfi_ops = NULL;
5875 }
5876
5877 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5878 restrict the emit_move optimization to leaf functions. */
5879 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5880 (!can_inherit_p || !crtl->is_leaf
5881 || df_regs_ever_live_p (EP0_REGNUM)));
5882
5883 if (cfi_ops)
5884 {
5885 /* Emit delayed restores and reset the CFA to be SP. */
5886 insn = get_last_insn ();
5887 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5888 REG_NOTES (insn) = cfi_ops;
5889 RTX_FRAME_RELATED_P (insn) = 1;
5890 }
5891
5892 /* We prefer to emit the combined return/authenticate instruction RETAA,
5893 however there are three cases in which we must instead emit an explicit
5894 authentication instruction.
5895
5896 1) Sibcalls don't return in a normal way, so if we're about to call one
5897 we must authenticate.
5898
5899 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5900 generating code for !TARGET_ARMV8_3 we can't use it and must
5901 explicitly authenticate.
5902
5903 3) On an eh_return path we make extra stack adjustments to update the
5904 canonical frame address to be the exception handler's CFA. We want
5905 to authenticate using the CFA of the function which calls eh_return.
5906 */
5907 if (aarch64_return_address_signing_enabled ()
5908 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5909 {
5910 insn = emit_insn (gen_autisp ());
5911 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5912 RTX_FRAME_RELATED_P (insn) = 1;
5913 }
5914
5915 /* Stack adjustment for exception handler. */
5916 if (crtl->calls_eh_return)
5917 {
5918 /* We need to unwind the stack by the offset computed by
5919 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5920 to be SP; letting the CFA move during this adjustment
5921 is just as correct as retaining the CFA from the body
5922 of the function. Therefore, do nothing special. */
5923 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5924 }
5925
5926 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5927 if (!for_sibcall)
5928 emit_jump_insn (ret_rtx);
5929 }
5930
5931 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5932 normally or return to a previous frame after unwinding.
5933
5934 An EH return uses a single shared return sequence. The epilogue is
5935 exactly like a normal epilogue except that it has an extra input
5936 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5937 that must be applied after the frame has been destroyed. An extra label
5938 is inserted before the epilogue which initializes this register to zero,
5939 and this is the entry point for a normal return.
5940
5941 An actual EH return updates the return address, initializes the stack
5942 adjustment and jumps directly into the epilogue (bypassing the zeroing
5943 of the adjustment). Since the return address is typically saved on the
5944 stack when a function makes a call, the saved LR must be updated outside
5945 the epilogue.
5946
5947 This poses problems as the store is generated well before the epilogue,
5948 so the offset of LR is not known yet. Also optimizations will remove the
5949 store as it appears dead, even after the epilogue is generated (as the
5950 base or offset for loading LR is different in many cases).
5951
5952 To avoid these problems this implementation forces the frame pointer
5953 in eh_return functions so that the location of LR is fixed and known early.
5954 It also marks the store volatile, so no optimization is permitted to
5955 remove the store. */
5956 rtx
5957 aarch64_eh_return_handler_rtx (void)
5958 {
5959 rtx tmp = gen_frame_mem (Pmode,
5960 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5961
5962 /* Mark the store volatile, so no optimization is permitted to remove it. */
5963 MEM_VOLATILE_P (tmp) = true;
5964 return tmp;
5965 }
5966
5967 /* Output code to add DELTA to the first argument, and then jump
5968 to FUNCTION. Used for C++ multiple inheritance. */
5969 static void
5970 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5971 HOST_WIDE_INT delta,
5972 HOST_WIDE_INT vcall_offset,
5973 tree function)
5974 {
5975 /* The this pointer is always in x0. Note that this differs from
5976 Arm where the this pointer may be bumped to r1 if r0 is required
5977 to return a pointer to an aggregate. On AArch64 a result value
5978 pointer will be in x8. */
5979 int this_regno = R0_REGNUM;
5980 rtx this_rtx, temp0, temp1, addr, funexp;
5981 rtx_insn *insn;
5982
5983 if (aarch64_bti_enabled ())
5984 emit_insn (gen_bti_c ());
5985
5986 reload_completed = 1;
5987 emit_note (NOTE_INSN_PROLOGUE_END);
5988
5989 this_rtx = gen_rtx_REG (Pmode, this_regno);
5990 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5991 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5992
5993 if (vcall_offset == 0)
5994 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5995 else
5996 {
5997 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5998
5999 addr = this_rtx;
6000 if (delta != 0)
6001 {
6002 if (delta >= -256 && delta < 256)
6003 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6004 plus_constant (Pmode, this_rtx, delta));
6005 else
6006 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6007 temp1, temp0, false);
6008 }
6009
6010 if (Pmode == ptr_mode)
6011 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6012 else
6013 aarch64_emit_move (temp0,
6014 gen_rtx_ZERO_EXTEND (Pmode,
6015 gen_rtx_MEM (ptr_mode, addr)));
6016
6017 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6018 addr = plus_constant (Pmode, temp0, vcall_offset);
6019 else
6020 {
6021 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6022 Pmode);
6023 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6024 }
6025
6026 if (Pmode == ptr_mode)
6027 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6028 else
6029 aarch64_emit_move (temp1,
6030 gen_rtx_SIGN_EXTEND (Pmode,
6031 gen_rtx_MEM (ptr_mode, addr)));
6032
6033 emit_insn (gen_add2_insn (this_rtx, temp1));
6034 }
6035
6036 /* Generate a tail call to the target function. */
6037 if (!TREE_USED (function))
6038 {
6039 assemble_external (function);
6040 TREE_USED (function) = 1;
6041 }
6042 funexp = XEXP (DECL_RTL (function), 0);
6043 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6044 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6045 SIBLING_CALL_P (insn) = 1;
6046
6047 insn = get_insns ();
6048 shorten_branches (insn);
6049 final_start_function (insn, file, 1);
6050 final (insn, file, 1);
6051 final_end_function ();
6052
6053 /* Stop pretending to be a post-reload pass. */
6054 reload_completed = 0;
6055 }
6056
6057 static bool
6058 aarch64_tls_referenced_p (rtx x)
6059 {
6060 if (!TARGET_HAVE_TLS)
6061 return false;
6062 subrtx_iterator::array_type array;
6063 FOR_EACH_SUBRTX (iter, array, x, ALL)
6064 {
6065 const_rtx x = *iter;
6066 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6067 return true;
6068 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6069 TLS offsets, not real symbol references. */
6070 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6071 iter.skip_subrtxes ();
6072 }
6073 return false;
6074 }
6075
6076
6077 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6078 a left shift of 0 or 12 bits. */
6079 bool
6080 aarch64_uimm12_shift (HOST_WIDE_INT val)
6081 {
6082 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6083 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6084 );
6085 }
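/* A minimal standalone sketch (not part of this file) restating the check
   above for illustration: a value is accepted when all of its set bits fit
   in bit positions [0,11] or all fit in [12,23].  The helper name below is
   made up for the example.  */

#include <assert.h>
#include <stdint.h>

static int uimm12_shift_example (int64_t val)
{
  return (val & (int64_t) 0xfff) == val
         || (val & ((int64_t) 0xfff << 12)) == val;
}

int main (void)
{
  assert (uimm12_shift_example (0xabc));     /* Fits with a shift of 0.  */
  assert (uimm12_shift_example (0xabc000));  /* Fits with a shift of 12.  */
  assert (!uimm12_shift_example (0xabc001)); /* Set bits in both halves.  */
  return 0;
}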
6086
6087 /* Return the largest value no greater than VAL that will fit as a 12-bit
6088 unsigned immediate created with a left shift of 0 or 12. */
6089 static HOST_WIDE_INT
6090 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6091 {
6092 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6093 handle correctly. */
6094 gcc_assert ((val & 0xffffff) == val);
6095
6096 if (((val & 0xfff) << 0) == val)
6097 return val;
6098
6099 return val & (0xfff << 12);
6100 }
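/* For example (hypothetical value): 0x123456 does not fit in the low 12
   bits, so the function above returns 0x123456 & (0xfff << 12), i.e.
   0x123000, dropping the low bits.  */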
6101
6102 /* Return true if val is an immediate that can be loaded into a
6103 register by a MOVZ instruction. */
6104 static bool
6105 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6106 {
6107 if (GET_MODE_SIZE (mode) > 4)
6108 {
6109 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6110 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6111 return 1;
6112 }
6113 else
6114 {
6115 /* Ignore sign extension. */
6116 val &= (HOST_WIDE_INT) 0xffffffff;
6117 }
6118 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6119 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6120 }
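/* Illustrative examples (not from the original sources): for DImode,
   0x0000ffff00000000 is accepted because all of its set bits lie in the
   16-bit field at position 32 (a single MOVZ with LSL #32 can build it),
   whereas 0x0000000100010000 is rejected because its set bits span two
   16-bit fields.  */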
6121
6122 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6123 64-bit (DImode) integer. */
6124
6125 static unsigned HOST_WIDE_INT
6126 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6127 {
6128 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6129 while (size < 64)
6130 {
6131 val &= (HOST_WIDE_INT_1U << size) - 1;
6132 val |= val << size;
6133 size *= 2;
6134 }
6135 return val;
6136 }
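/* For example: replicating the 16-bit (HImode) value 0x1234 with the loop
   above yields 0x1234123412341234.  */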
6137
6138 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6139
6140 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6141 {
6142 0x0000000100000001ull,
6143 0x0001000100010001ull,
6144 0x0101010101010101ull,
6145 0x1111111111111111ull,
6146 0x5555555555555555ull,
6147 };
6148
6149
6150 /* Return true if val is a valid bitmask immediate. */
6151
6152 bool
6153 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6154 {
6155 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6156 int bits;
6157
6158 /* Check for a single sequence of one bits and return quickly if so.
6159 The special cases of all ones and all zeroes return false. */
6160 val = aarch64_replicate_bitmask_imm (val_in, mode);
6161 tmp = val + (val & -val);
6162
6163 if (tmp == (tmp & -tmp))
6164 return (val + 1) > 1;
6165
6166 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6167 if (mode == SImode)
6168 val = (val << 32) | (val & 0xffffffff);
6169
6170 /* Invert if the immediate doesn't start with a zero bit - this means we
6171 only need to search for sequences of one bits. */
6172 if (val & 1)
6173 val = ~val;
6174
6175 /* Find the first set bit and set tmp to val with the first sequence of one
6176 bits removed. Return success if there is a single sequence of ones. */
6177 first_one = val & -val;
6178 tmp = val & (val + first_one);
6179
6180 if (tmp == 0)
6181 return true;
6182
6183 /* Find the next set bit and compute the difference in bit position. */
6184 next_one = tmp & -tmp;
6185 bits = clz_hwi (first_one) - clz_hwi (next_one);
6186 mask = val ^ tmp;
6187
6188 /* Check the bit position difference is a power of 2, and that the first
6189 sequence of one bits fits within 'bits' bits. */
6190 if ((mask >> bits) != 0 || bits != (bits & -bits))
6191 return false;
6192
6193 /* Check the sequence of one bits is repeated 64/bits times. */
6194 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6195 }
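/* Worked examples (added for illustration): the DImode value 0xff00 is a
   single run of ones, so the quick check succeeds (TMP becomes 0x10000, a
   power of two) and the value is accepted.  The repeating pattern
   0x0f0f0f0f0f0f0f0f starts with a one bit and is therefore inverted to
   0xf0f0f0f0f0f0f0f0; the run width works out to BITS == 8 with
   MASK == 0xf0, and multiplying by bitmask_imm_mul[2] reproduces the
   inverted value, so it is accepted too.  0 and ~0 are rejected by the
   quick check.  */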
6196
6197 /* Create a mask of ones covering the range from the lowest to the highest
6198 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6199
6200 unsigned HOST_WIDE_INT
6201 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6202 {
6203 int lowest_bit_set = ctz_hwi (val_in);
6204 int highest_bit_set = floor_log2 (val_in);
6205 gcc_assert (val_in != 0);
6206
6207 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6208 (HOST_WIDE_INT_1U << lowest_bit_set));
6209 }
6210
6211 /* Create a constant in which all bits outside the range from the lowest
6212 to the highest bit set in VAL_IN are set to 1. */
6213
6214 unsigned HOST_WIDE_INT
6215 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6216 {
6217 return val_in | ~aarch64_and_split_imm1 (val_in);
6218 }
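/* Worked example (illustrative value): for VAL_IN == 0x00ff0f00 the lowest
   and highest set bits are 8 and 23, so aarch64_and_split_imm1 returns
   0x00ffff00 and aarch64_and_split_imm2 returns 0xffffffffffff0fff.
   Since imm1 & imm2 == VAL_IN, ANDing with the two masks in sequence is
   equivalent to ANDing with the original value.  */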
6219
6220 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6221
6222 bool
6223 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6224 {
6225 scalar_int_mode int_mode;
6226 if (!is_a <scalar_int_mode> (mode, &int_mode))
6227 return false;
6228
6229 if (aarch64_bitmask_imm (val_in, int_mode))
6230 return false;
6231
6232 if (aarch64_move_imm (val_in, int_mode))
6233 return false;
6234
6235 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6236
6237 return aarch64_bitmask_imm (imm2, int_mode);
6238 }
6239
6240 /* Return true if val is an immediate that can be loaded into a
6241 register in a single instruction. */
6242 bool
6243 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6244 {
6245 scalar_int_mode int_mode;
6246 if (!is_a <scalar_int_mode> (mode, &int_mode))
6247 return false;
6248
6249 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6250 return 1;
6251 return aarch64_bitmask_imm (val, int_mode);
6252 }
6253
6254 static bool
6255 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6256 {
6257 rtx base, offset;
6258
6259 if (GET_CODE (x) == HIGH)
6260 return true;
6261
6262 /* There's no way to calculate VL-based values using relocations. */
6263 subrtx_iterator::array_type array;
6264 FOR_EACH_SUBRTX (iter, array, x, ALL)
6265 if (GET_CODE (*iter) == CONST_POLY_INT)
6266 return true;
6267
6268 split_const (x, &base, &offset);
6269 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6270 {
6271 if (aarch64_classify_symbol (base, INTVAL (offset))
6272 != SYMBOL_FORCE_TO_MEM)
6273 return true;
6274 else
6275 /* Avoid generating a 64-bit relocation in ILP32; leave
6276 to aarch64_expand_mov_immediate to handle it properly. */
6277 return mode != ptr_mode;
6278 }
6279
6280 return aarch64_tls_referenced_p (x);
6281 }
6282
6283 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6284 The expansion for a table switch is quite expensive due to the number
6285 of instructions, the table lookup and the hard-to-predict indirect jump.
6286 When optimizing for speed with -O3 enabled, use the per-core tuning if
6287 set, otherwise use tables for > 16 cases as a tradeoff between size and
6288 performance. When optimizing for size, use the default setting. */
6289
6290 static unsigned int
6291 aarch64_case_values_threshold (void)
6292 {
6293 /* Use the specified limit for the number of cases before using jump
6294 tables at higher optimization levels. */
6295 if (optimize > 2
6296 && selected_cpu->tune->max_case_values != 0)
6297 return selected_cpu->tune->max_case_values;
6298 else
6299 return optimize_size ? default_case_values_threshold () : 17;
6300 }
6301
6302 /* Return true if register REGNO is a valid index register.
6303 STRICT_P is true if REG_OK_STRICT is in effect. */
6304
6305 bool
6306 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6307 {
6308 if (!HARD_REGISTER_NUM_P (regno))
6309 {
6310 if (!strict_p)
6311 return true;
6312
6313 if (!reg_renumber)
6314 return false;
6315
6316 regno = reg_renumber[regno];
6317 }
6318 return GP_REGNUM_P (regno);
6319 }
6320
6321 /* Return true if register REGNO is a valid base register for mode MODE.
6322 STRICT_P is true if REG_OK_STRICT is in effect. */
6323
6324 bool
6325 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6326 {
6327 if (!HARD_REGISTER_NUM_P (regno))
6328 {
6329 if (!strict_p)
6330 return true;
6331
6332 if (!reg_renumber)
6333 return false;
6334
6335 regno = reg_renumber[regno];
6336 }
6337
6338 /* The fake registers will be eliminated to either the stack or
6339 hard frame pointer, both of which are usually valid base registers.
6340 Reload deals with the cases where the eliminated form isn't valid. */
6341 return (GP_REGNUM_P (regno)
6342 || regno == SP_REGNUM
6343 || regno == FRAME_POINTER_REGNUM
6344 || regno == ARG_POINTER_REGNUM);
6345 }
6346
6347 /* Return true if X is a valid base register for mode MODE.
6348 STRICT_P is true if REG_OK_STRICT is in effect. */
6349
6350 static bool
6351 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6352 {
6353 if (!strict_p
6354 && GET_CODE (x) == SUBREG
6355 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6356 x = SUBREG_REG (x);
6357
6358 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6359 }
6360
6361 /* Return true if address offset is a valid index. If it is, fill in INFO
6362 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6363
6364 static bool
6365 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6366 machine_mode mode, bool strict_p)
6367 {
6368 enum aarch64_address_type type;
6369 rtx index;
6370 int shift;
6371
6372 /* (reg:P) */
6373 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6374 && GET_MODE (x) == Pmode)
6375 {
6376 type = ADDRESS_REG_REG;
6377 index = x;
6378 shift = 0;
6379 }
6380 /* (sign_extend:DI (reg:SI)) */
6381 else if ((GET_CODE (x) == SIGN_EXTEND
6382 || GET_CODE (x) == ZERO_EXTEND)
6383 && GET_MODE (x) == DImode
6384 && GET_MODE (XEXP (x, 0)) == SImode)
6385 {
6386 type = (GET_CODE (x) == SIGN_EXTEND)
6387 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6388 index = XEXP (x, 0);
6389 shift = 0;
6390 }
6391 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6392 else if (GET_CODE (x) == MULT
6393 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6394 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6395 && GET_MODE (XEXP (x, 0)) == DImode
6396 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6397 && CONST_INT_P (XEXP (x, 1)))
6398 {
6399 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6400 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6401 index = XEXP (XEXP (x, 0), 0);
6402 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6403 }
6404 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6405 else if (GET_CODE (x) == ASHIFT
6406 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6407 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6408 && GET_MODE (XEXP (x, 0)) == DImode
6409 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6410 && CONST_INT_P (XEXP (x, 1)))
6411 {
6412 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6413 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6414 index = XEXP (XEXP (x, 0), 0);
6415 shift = INTVAL (XEXP (x, 1));
6416 }
6417 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6418 else if ((GET_CODE (x) == SIGN_EXTRACT
6419 || GET_CODE (x) == ZERO_EXTRACT)
6420 && GET_MODE (x) == DImode
6421 && GET_CODE (XEXP (x, 0)) == MULT
6422 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6423 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6424 {
6425 type = (GET_CODE (x) == SIGN_EXTRACT)
6426 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6427 index = XEXP (XEXP (x, 0), 0);
6428 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6429 if (INTVAL (XEXP (x, 1)) != 32 + shift
6430 || INTVAL (XEXP (x, 2)) != 0)
6431 shift = -1;
6432 }
6433 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6434 (const_int 0xffffffff<<shift)) */
6435 else if (GET_CODE (x) == AND
6436 && GET_MODE (x) == DImode
6437 && GET_CODE (XEXP (x, 0)) == MULT
6438 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6439 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6440 && CONST_INT_P (XEXP (x, 1)))
6441 {
6442 type = ADDRESS_REG_UXTW;
6443 index = XEXP (XEXP (x, 0), 0);
6444 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6445 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6446 shift = -1;
6447 }
6448 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6449 else if ((GET_CODE (x) == SIGN_EXTRACT
6450 || GET_CODE (x) == ZERO_EXTRACT)
6451 && GET_MODE (x) == DImode
6452 && GET_CODE (XEXP (x, 0)) == ASHIFT
6453 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6454 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6455 {
6456 type = (GET_CODE (x) == SIGN_EXTRACT)
6457 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6458 index = XEXP (XEXP (x, 0), 0);
6459 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6460 if (INTVAL (XEXP (x, 1)) != 32 + shift
6461 || INTVAL (XEXP (x, 2)) != 0)
6462 shift = -1;
6463 }
6464 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6465 (const_int 0xffffffff<<shift)) */
6466 else if (GET_CODE (x) == AND
6467 && GET_MODE (x) == DImode
6468 && GET_CODE (XEXP (x, 0)) == ASHIFT
6469 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6470 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6471 && CONST_INT_P (XEXP (x, 1)))
6472 {
6473 type = ADDRESS_REG_UXTW;
6474 index = XEXP (XEXP (x, 0), 0);
6475 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6476 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6477 shift = -1;
6478 }
6479 /* (mult:P (reg:P) (const_int scale)) */
6480 else if (GET_CODE (x) == MULT
6481 && GET_MODE (x) == Pmode
6482 && GET_MODE (XEXP (x, 0)) == Pmode
6483 && CONST_INT_P (XEXP (x, 1)))
6484 {
6485 type = ADDRESS_REG_REG;
6486 index = XEXP (x, 0);
6487 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6488 }
6489 /* (ashift:P (reg:P) (const_int shift)) */
6490 else if (GET_CODE (x) == ASHIFT
6491 && GET_MODE (x) == Pmode
6492 && GET_MODE (XEXP (x, 0)) == Pmode
6493 && CONST_INT_P (XEXP (x, 1)))
6494 {
6495 type = ADDRESS_REG_REG;
6496 index = XEXP (x, 0);
6497 shift = INTVAL (XEXP (x, 1));
6498 }
6499 else
6500 return false;
6501
6502 if (!strict_p
6503 && GET_CODE (index) == SUBREG
6504 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6505 index = SUBREG_REG (index);
6506
6507 if (aarch64_sve_data_mode_p (mode))
6508 {
6509 if (type != ADDRESS_REG_REG
6510 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6511 return false;
6512 }
6513 else
6514 {
6515 if (shift != 0
6516 && !(IN_RANGE (shift, 1, 3)
6517 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6518 return false;
6519 }
6520
6521 if (REG_P (index)
6522 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6523 {
6524 info->type = type;
6525 info->offset = index;
6526 info->shift = shift;
6527 return true;
6528 }
6529
6530 return false;
6531 }
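/* Illustrative example (assuming Pmode is DImode): for a DImode access,
   an index of the form (ashift:DI (reg:DI Xm) (const_int 3)) is accepted
   with SHIFT == 3, since 1 << 3 matches the 8-byte access size; a shift of
   2 would be rejected for DImode by the size check above.  */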
6532
6533 /* Return true if MODE is one of the modes for which we
6534 support LDP/STP operations. */
6535
6536 static bool
6537 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6538 {
6539 return mode == SImode || mode == DImode
6540 || mode == SFmode || mode == DFmode
6541 || (aarch64_vector_mode_supported_p (mode)
6542 && (known_eq (GET_MODE_SIZE (mode), 8)
6543 || (known_eq (GET_MODE_SIZE (mode), 16)
6544 && (aarch64_tune_params.extra_tuning_flags
6545 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6546 }
6547
6548 /* Return true if REGNO is a virtual pointer register, or an eliminable
6549 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6550 include stack_pointer or hard_frame_pointer. */
6551 static bool
6552 virt_or_elim_regno_p (unsigned regno)
6553 {
6554 return ((regno >= FIRST_VIRTUAL_REGISTER
6555 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6556 || regno == FRAME_POINTER_REGNUM
6557 || regno == ARG_POINTER_REGNUM);
6558 }
6559
6560 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6561 If it is, fill in INFO appropriately. STRICT_P is true if
6562 REG_OK_STRICT is in effect. */
6563
6564 bool
6565 aarch64_classify_address (struct aarch64_address_info *info,
6566 rtx x, machine_mode mode, bool strict_p,
6567 aarch64_addr_query_type type)
6568 {
6569 enum rtx_code code = GET_CODE (x);
6570 rtx op0, op1;
6571 poly_int64 offset;
6572
6573 HOST_WIDE_INT const_size;
6574
6575 /* On BE, we use load/store pair for all large int mode load/stores.
6576 TI/TFmode may also use a load/store pair. */
6577 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6578 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6579 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6580 || type == ADDR_QUERY_LDP_STP_N
6581 || mode == TImode
6582 || mode == TFmode
6583 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6584
6585 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
6586 corresponds to the actual size of the memory being loaded/stored and the
6587 mode used for the addressing calculation is half of that size. */
6588 if (type == ADDR_QUERY_LDP_STP_N
6589 && known_eq (GET_MODE_SIZE (mode), 16))
6590 mode = DFmode;
6591
6592 bool allow_reg_index_p = (!load_store_pair_p
6593 && (known_lt (GET_MODE_SIZE (mode), 16)
6594 || vec_flags == VEC_ADVSIMD
6595 || vec_flags == VEC_SVE_DATA));
6596
6597 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6598 [Rn, #offset, MUL VL]. */
6599 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6600 && (code != REG && code != PLUS))
6601 return false;
6602
6603 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6604 REG addressing. */
6605 if (advsimd_struct_p
6606 && !BYTES_BIG_ENDIAN
6607 && (code != POST_INC && code != REG))
6608 return false;
6609
6610 gcc_checking_assert (GET_MODE (x) == VOIDmode
6611 || SCALAR_INT_MODE_P (GET_MODE (x)));
6612
6613 switch (code)
6614 {
6615 case REG:
6616 case SUBREG:
6617 info->type = ADDRESS_REG_IMM;
6618 info->base = x;
6619 info->offset = const0_rtx;
6620 info->const_offset = 0;
6621 return aarch64_base_register_rtx_p (x, strict_p);
6622
6623 case PLUS:
6624 op0 = XEXP (x, 0);
6625 op1 = XEXP (x, 1);
6626
6627 if (! strict_p
6628 && REG_P (op0)
6629 && virt_or_elim_regno_p (REGNO (op0))
6630 && poly_int_rtx_p (op1, &offset))
6631 {
6632 info->type = ADDRESS_REG_IMM;
6633 info->base = op0;
6634 info->offset = op1;
6635 info->const_offset = offset;
6636
6637 return true;
6638 }
6639
6640 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6641 && aarch64_base_register_rtx_p (op0, strict_p)
6642 && poly_int_rtx_p (op1, &offset))
6643 {
6644 info->type = ADDRESS_REG_IMM;
6645 info->base = op0;
6646 info->offset = op1;
6647 info->const_offset = offset;
6648
6649 /* TImode and TFmode values are allowed in both pairs of X
6650 registers and individual Q registers. The available
6651 address modes are:
6652 X,X: 7-bit signed scaled offset
6653 Q: 9-bit signed offset
6654 We conservatively require an offset representable in either mode.
6655 When performing the check for pairs of X registers i.e. LDP/STP
6656 pass down DImode since that is the natural size of the LDP/STP
6657 instruction memory accesses. */
6658 if (mode == TImode || mode == TFmode)
6659 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6660 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6661 || offset_12bit_unsigned_scaled_p (mode, offset)));
6662
6663 /* A 7-bit offset check because OImode will emit an ldp/stp
6664 instruction (only big endian will get here).
6665 For ldp/stp instructions, the offset is scaled for the size of a
6666 single element of the pair. */
6667 if (mode == OImode)
6668 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6669
6670 /* Three 9/12-bit offset checks because CImode will emit three
6671 ldr/str instructions (only big endian will get here). */
6672 if (mode == CImode)
6673 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6674 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6675 offset + 32)
6676 || offset_12bit_unsigned_scaled_p (V16QImode,
6677 offset + 32)));
6678
6679 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6680 instructions (only big endian will get here). */
6681 if (mode == XImode)
6682 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6683 && aarch64_offset_7bit_signed_scaled_p (TImode,
6684 offset + 32));
6685
6686 /* Make "m" use the LD1 offset range for SVE data modes, so
6687 that pre-RTL optimizers like ivopts will work with that range
6688 instead of the wider LDR/STR range. */
6689 if (vec_flags == VEC_SVE_DATA)
6690 return (type == ADDR_QUERY_M
6691 ? offset_4bit_signed_scaled_p (mode, offset)
6692 : offset_9bit_signed_scaled_p (mode, offset));
6693
6694 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6695 {
6696 poly_int64 end_offset = (offset
6697 + GET_MODE_SIZE (mode)
6698 - BYTES_PER_SVE_VECTOR);
6699 return (type == ADDR_QUERY_M
6700 ? offset_4bit_signed_scaled_p (mode, offset)
6701 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6702 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6703 end_offset)));
6704 }
6705
6706 if (vec_flags == VEC_SVE_PRED)
6707 return offset_9bit_signed_scaled_p (mode, offset);
6708
6709 if (load_store_pair_p)
6710 return ((known_eq (GET_MODE_SIZE (mode), 4)
6711 || known_eq (GET_MODE_SIZE (mode), 8)
6712 || known_eq (GET_MODE_SIZE (mode), 16))
6713 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6714 else
6715 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6716 || offset_12bit_unsigned_scaled_p (mode, offset));
6717 }
6718
6719 if (allow_reg_index_p)
6720 {
6721 /* Look for base + (scaled/extended) index register. */
6722 if (aarch64_base_register_rtx_p (op0, strict_p)
6723 && aarch64_classify_index (info, op1, mode, strict_p))
6724 {
6725 info->base = op0;
6726 return true;
6727 }
6728 if (aarch64_base_register_rtx_p (op1, strict_p)
6729 && aarch64_classify_index (info, op0, mode, strict_p))
6730 {
6731 info->base = op1;
6732 return true;
6733 }
6734 }
6735
6736 return false;
6737
6738 case POST_INC:
6739 case POST_DEC:
6740 case PRE_INC:
6741 case PRE_DEC:
6742 info->type = ADDRESS_REG_WB;
6743 info->base = XEXP (x, 0);
6744 info->offset = NULL_RTX;
6745 return aarch64_base_register_rtx_p (info->base, strict_p);
6746
6747 case POST_MODIFY:
6748 case PRE_MODIFY:
6749 info->type = ADDRESS_REG_WB;
6750 info->base = XEXP (x, 0);
6751 if (GET_CODE (XEXP (x, 1)) == PLUS
6752 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6753 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6754 && aarch64_base_register_rtx_p (info->base, strict_p))
6755 {
6756 info->offset = XEXP (XEXP (x, 1), 1);
6757 info->const_offset = offset;
6758
6759 /* TImode and TFmode values are allowed in both pairs of X
6760 registers and individual Q registers. The available
6761 address modes are:
6762 X,X: 7-bit signed scaled offset
6763 Q: 9-bit signed offset
6764 We conservatively require an offset representable in either mode.
6765 */
6766 if (mode == TImode || mode == TFmode)
6767 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6768 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6769
6770 if (load_store_pair_p)
6771 return ((known_eq (GET_MODE_SIZE (mode), 4)
6772 || known_eq (GET_MODE_SIZE (mode), 8)
6773 || known_eq (GET_MODE_SIZE (mode), 16))
6774 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6775 else
6776 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6777 }
6778 return false;
6779
6780 case CONST:
6781 case SYMBOL_REF:
6782 case LABEL_REF:
6783 /* load literal: pc-relative constant pool entry. Only supported
6784 for SI mode or larger. */
6785 info->type = ADDRESS_SYMBOLIC;
6786
6787 if (!load_store_pair_p
6788 && GET_MODE_SIZE (mode).is_constant (&const_size)
6789 && const_size >= 4)
6790 {
6791 rtx sym, addend;
6792
6793 split_const (x, &sym, &addend);
6794 return ((GET_CODE (sym) == LABEL_REF
6795 || (GET_CODE (sym) == SYMBOL_REF
6796 && CONSTANT_POOL_ADDRESS_P (sym)
6797 && aarch64_pcrelative_literal_loads)));
6798 }
6799 return false;
6800
6801 case LO_SUM:
6802 info->type = ADDRESS_LO_SUM;
6803 info->base = XEXP (x, 0);
6804 info->offset = XEXP (x, 1);
6805 if (allow_reg_index_p
6806 && aarch64_base_register_rtx_p (info->base, strict_p))
6807 {
6808 rtx sym, offs;
6809 split_const (info->offset, &sym, &offs);
6810 if (GET_CODE (sym) == SYMBOL_REF
6811 && (aarch64_classify_symbol (sym, INTVAL (offs))
6812 == SYMBOL_SMALL_ABSOLUTE))
6813 {
6814 /* The symbol and offset must be aligned to the access size. */
6815 unsigned int align;
6816
6817 if (CONSTANT_POOL_ADDRESS_P (sym))
6818 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6819 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6820 {
6821 tree exp = SYMBOL_REF_DECL (sym);
6822 align = TYPE_ALIGN (TREE_TYPE (exp));
6823 align = aarch64_constant_alignment (exp, align);
6824 }
6825 else if (SYMBOL_REF_DECL (sym))
6826 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6827 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6828 && SYMBOL_REF_BLOCK (sym) != NULL)
6829 align = SYMBOL_REF_BLOCK (sym)->alignment;
6830 else
6831 align = BITS_PER_UNIT;
6832
6833 poly_int64 ref_size = GET_MODE_SIZE (mode);
6834 if (known_eq (ref_size, 0))
6835 ref_size = GET_MODE_SIZE (DImode);
6836
6837 return (multiple_p (INTVAL (offs), ref_size)
6838 && multiple_p (align / BITS_PER_UNIT, ref_size));
6839 }
6840 }
6841 return false;
6842
6843 default:
6844 return false;
6845 }
6846 }
6847
6848 /* Return true if the address X is valid for a PRFM instruction.
6849 STRICT_P is true if we should do strict checking with
6850 aarch64_classify_address. */
6851
6852 bool
6853 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6854 {
6855 struct aarch64_address_info addr;
6856
6857 /* PRFM accepts the same addresses as DImode... */
6858 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6859 if (!res)
6860 return false;
6861
6862 /* ... except writeback forms. */
6863 return addr.type != ADDRESS_REG_WB;
6864 }
6865
6866 bool
6867 aarch64_symbolic_address_p (rtx x)
6868 {
6869 rtx offset;
6870
6871 split_const (x, &x, &offset);
6872 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6873 }
6874
6875 /* Classify the base of symbolic expression X. */
6876
6877 enum aarch64_symbol_type
6878 aarch64_classify_symbolic_expression (rtx x)
6879 {
6880 rtx offset;
6881
6882 split_const (x, &x, &offset);
6883 return aarch64_classify_symbol (x, INTVAL (offset));
6884 }
6885
6886
6887 /* Return TRUE if X is a legitimate address for accessing memory in
6888 mode MODE. */
6889 static bool
6890 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6891 {
6892 struct aarch64_address_info addr;
6893
6894 return aarch64_classify_address (&addr, x, mode, strict_p);
6895 }
6896
6897 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6898 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6899 bool
6900 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6901 aarch64_addr_query_type type)
6902 {
6903 struct aarch64_address_info addr;
6904
6905 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6906 }
6907
6908 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6909
6910 static bool
6911 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6912 poly_int64 orig_offset,
6913 machine_mode mode)
6914 {
6915 HOST_WIDE_INT size;
6916 if (GET_MODE_SIZE (mode).is_constant (&size))
6917 {
6918 HOST_WIDE_INT const_offset, second_offset;
6919
6920 /* A general SVE offset is A * VQ + B. Remove the A component from
6921 coefficient 0 in order to get the constant B. */
6922 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6923
6924 /* Split an out-of-range address displacement into a base and
6925 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
6926 range otherwise to increase opportunities for sharing the base
6927 address between accesses of different sizes. Unaligned accesses
6928 use the signed 9-bit range; TImode/TFmode use the intersection
6929 of the signed scaled 7-bit and signed 9-bit offset ranges. */
6930 if (mode == TImode || mode == TFmode)
6931 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6932 else if ((const_offset & (size - 1)) != 0)
6933 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6934 else
6935 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6936
6937 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6938 return false;
6939
6940 /* Split the offset into second_offset and the rest. */
6941 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6942 *offset2 = gen_int_mode (second_offset, Pmode);
6943 return true;
6944 }
6945 else
6946 {
6947 /* Get the mode we should use as the basis of the range. For structure
6948 modes this is the mode of one vector. */
6949 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6950 machine_mode step_mode
6951 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6952
6953 /* Get the "mul vl" multiplier we'd like to use. */
6954 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6955 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6956 if (vec_flags & VEC_SVE_DATA)
6957 /* LDR supports a 9-bit range, but the move patterns for
6958 structure modes require all vectors to be in range of the
6959 same base. The simplest way of accommodating that while still
6960 promoting reuse of anchor points between different modes is
6961 to use an 8-bit range unconditionally. */
6962 vnum = ((vnum + 128) & 255) - 128;
6963 else
6964 /* Predicates are only handled singly, so we might as well use
6965 the full range. */
6966 vnum = ((vnum + 256) & 511) - 256;
6967 if (vnum == 0)
6968 return false;
6969
6970 /* Convert the "mul vl" multiplier into a byte offset. */
6971 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6972 if (known_eq (second_offset, orig_offset))
6973 return false;
6974
6975 /* Split the offset into second_offset and the rest. */
6976 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6977 *offset2 = gen_int_mode (second_offset, Pmode);
6978 return true;
6979 }
6980 }
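
/* A worked example of the constant-size path above (illustrative only):
   for an SImode access (4 bytes) with an out-of-range displacement of
   0x10020, the aligned case computes second_offset = 0x10020 & 0x3ffc
   = 0x20, so the displacement is split into

       *offset1 = 0x10000   (folded into the base, shareable via CSE)
       *offset2 = 0x20      (fits the unsigned scaled 12-bit LDR/STR range)

   The unaligned and TImode/TFmode cases instead round to the signed
   9-bit range or to its intersection with the scaled 7-bit range.  */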
6981
6982 /* Return the binary representation of floating point constant VALUE in INTVAL.
6983 If the value cannot be converted, return false without setting INTVAL.
6984 The conversion is done in the mode of VALUE. */
6985 bool
6986 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6987 {
6988
6989 /* We make a general exception for 0. */
6990 if (aarch64_float_const_zero_rtx_p (value))
6991 {
6992 *intval = 0;
6993 return true;
6994 }
6995
6996 scalar_float_mode mode;
6997 if (GET_CODE (value) != CONST_DOUBLE
6998 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6999 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7000 /* Only support up to DF mode. */
7001 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7002 return false;
7003
7004 unsigned HOST_WIDE_INT ival = 0;
7005
7006 long res[2];
7007 real_to_target (res,
7008 CONST_DOUBLE_REAL_VALUE (value),
7009 REAL_MODE_FORMAT (mode));
7010
7011 if (mode == DFmode)
7012 {
7013 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7014 ival = zext_hwi (res[order], 32);
7015 ival |= (zext_hwi (res[1 - order], 32) << 32);
7016 }
7017 else
7018 ival = zext_hwi (res[0], 32);
7019
7020 *intval = ival;
7021 return true;
7022 }
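
/* For illustration: the DFmode constant 1.0 is returned by the function
   above as *intval == 0x3ff0000000000000 (its IEEE binary64 encoding),
   and the SFmode constant 1.0 as 0x3f800000.  Callers such as
   aarch64_float_const_rtx_p then ask how many MOV/MOVK instructions are
   needed to materialize that integer.  */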
7023
7024 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7025 single MOV(+MOVK) followed by an FMOV. */
7026 bool
7027 aarch64_float_const_rtx_p (rtx x)
7028 {
7029 machine_mode mode = GET_MODE (x);
7030 if (mode == VOIDmode)
7031 return false;
7032
7033 /* Determine whether it's cheaper to write float constants as
7034 mov/movk pairs than as ldr/adrp pairs. */
7035 unsigned HOST_WIDE_INT ival;
7036
7037 if (GET_CODE (x) == CONST_DOUBLE
7038 && SCALAR_FLOAT_MODE_P (mode)
7039 && aarch64_reinterpret_float_as_int (x, &ival))
7040 {
7041 scalar_int_mode imode = (mode == HFmode
7042 ? SImode
7043 : int_mode_for_mode (mode).require ());
7044 int num_instr = aarch64_internal_mov_immediate
7045 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7046 return num_instr < 3;
7047 }
7048
7049 return false;
7050 }
7051
7052 /* Return TRUE if rtx X is the immediate constant 0.0. */
7053 bool
7054 aarch64_float_const_zero_rtx_p (rtx x)
7055 {
7056 if (GET_MODE (x) == VOIDmode)
7057 return false;
7058
7059 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7060 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7061 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7062 }
7063
7064 /* Return TRUE if rtx X is an immediate constant that fits in a single
7065 MOVI immediate operation. */
7066 bool
7067 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7068 {
7069 if (!TARGET_SIMD)
7070 return false;
7071
7072 machine_mode vmode;
7073 scalar_int_mode imode;
7074 unsigned HOST_WIDE_INT ival;
7075
7076 if (GET_CODE (x) == CONST_DOUBLE
7077 && SCALAR_FLOAT_MODE_P (mode))
7078 {
7079 if (!aarch64_reinterpret_float_as_int (x, &ival))
7080 return false;
7081
7082 /* We make a general exception for 0. */
7083 if (aarch64_float_const_zero_rtx_p (x))
7084 return true;
7085
7086 imode = int_mode_for_mode (mode).require ();
7087 }
7088 else if (GET_CODE (x) == CONST_INT
7089 && is_a <scalar_int_mode> (mode, &imode))
7090 ival = INTVAL (x);
7091 else
7092 return false;
7093
7094 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7095 a 128-bit vector mode. */
7096 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7097
7098 vmode = aarch64_simd_container_mode (imode, width);
7099 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7100
7101 return aarch64_simd_valid_immediate (v_op, NULL);
7102 }
7103
7104
7105 /* Return the fixed registers used for condition codes. */
7106
7107 static bool
7108 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7109 {
7110 *p1 = CC_REGNUM;
7111 *p2 = INVALID_REGNUM;
7112 return true;
7113 }
7114
7115 /* This function is used by the call expanders of the machine description.
7116 RESULT is the register in which the result is returned. It's NULL for
7117 "call" and "sibcall".
7118 MEM is the location of the function call.
7119 SIBCALL indicates whether this function call is a normal call or a sibling
7120 call. It will generate a different pattern accordingly. */
7121
7122 void
7123 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7124 {
7125 rtx call, callee, tmp;
7126 rtvec vec;
7127 machine_mode mode;
7128
7129 gcc_assert (MEM_P (mem));
7130 callee = XEXP (mem, 0);
7131 mode = GET_MODE (callee);
7132 gcc_assert (mode == Pmode);
7133
7134 /* Decide if we should generate indirect calls by loading the
7135 address of the callee into a register before performing
7136 the branch-and-link. */
7137 if (SYMBOL_REF_P (callee)
7138 ? (aarch64_is_long_call_p (callee)
7139 || aarch64_is_noplt_call_p (callee))
7140 : !REG_P (callee))
7141 XEXP (mem, 0) = force_reg (mode, callee);
7142
7143 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7144
7145 if (result != NULL_RTX)
7146 call = gen_rtx_SET (result, call);
7147
7148 if (sibcall)
7149 tmp = ret_rtx;
7150 else
7151 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7152
7153 vec = gen_rtvec (2, call, tmp);
7154 call = gen_rtx_PARALLEL (VOIDmode, vec);
7155
7156 aarch64_emit_call_insn (call);
7157 }
7158
7159 /* Emit call insn with PAT and do aarch64-specific handling. */
7160
7161 void
7162 aarch64_emit_call_insn (rtx pat)
7163 {
7164 rtx insn = emit_call_insn (pat);
7165
7166 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7167 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7168 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7169 }
7170
7171 machine_mode
7172 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7173 {
7174 machine_mode mode_x = GET_MODE (x);
7175 rtx_code code_x = GET_CODE (x);
7176
7177 /* All floating point compares return CCFP if it is an equality
7178 comparison, and CCFPE otherwise. */
7179 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7180 {
7181 switch (code)
7182 {
7183 case EQ:
7184 case NE:
7185 case UNORDERED:
7186 case ORDERED:
7187 case UNLT:
7188 case UNLE:
7189 case UNGT:
7190 case UNGE:
7191 case UNEQ:
7192 return CCFPmode;
7193
7194 case LT:
7195 case LE:
7196 case GT:
7197 case GE:
7198 case LTGT:
7199 return CCFPEmode;
7200
7201 default:
7202 gcc_unreachable ();
7203 }
7204 }
7205
7206 /* Equality comparisons of short modes against zero can be performed
7207 using the TST instruction with the appropriate bitmask. */
7208 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7209 && (code == EQ || code == NE)
7210 && (mode_x == HImode || mode_x == QImode))
7211 return CC_NZmode;
7212
7213 /* Similarly, comparisons of zero_extends from shorter modes can
7214 be performed using an ANDS with an immediate mask. */
7215 if (y == const0_rtx && code_x == ZERO_EXTEND
7216 && (mode_x == SImode || mode_x == DImode)
7217 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7218 && (code == EQ || code == NE))
7219 return CC_NZmode;
7220
7221 if ((mode_x == SImode || mode_x == DImode)
7222 && y == const0_rtx
7223 && (code == EQ || code == NE || code == LT || code == GE)
7224 && (code_x == PLUS || code_x == MINUS || code_x == AND
7225 || code_x == NEG
7226 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7227 && CONST_INT_P (XEXP (x, 2)))))
7228 return CC_NZmode;
7229
7230 /* A compare with a shifted operand. Because of canonicalization,
7231 the comparison will have to be swapped when we emit the assembly
7232 code. */
7233 if ((mode_x == SImode || mode_x == DImode)
7234 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7235 && (code_x == ASHIFT || code_x == ASHIFTRT
7236 || code_x == LSHIFTRT
7237 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7238 return CC_SWPmode;
7239
7240 /* Similarly for a negated operand, but we can only do this for
7241 equalities. */
7242 if ((mode_x == SImode || mode_x == DImode)
7243 && (REG_P (y) || GET_CODE (y) == SUBREG)
7244 && (code == EQ || code == NE)
7245 && code_x == NEG)
7246 return CC_Zmode;
7247
7248 /* A test for unsigned overflow from an addition. */
7249 if ((mode_x == DImode || mode_x == TImode)
7250 && (code == LTU || code == GEU)
7251 && code_x == PLUS
7252 && rtx_equal_p (XEXP (x, 0), y))
7253 return CC_Cmode;
7254
7255 /* A test for unsigned overflow from an add with carry. */
7256 if ((mode_x == DImode || mode_x == TImode)
7257 && (code == LTU || code == GEU)
7258 && code_x == PLUS
7259 && CONST_SCALAR_INT_P (y)
7260 && (rtx_mode_t (y, mode_x)
7261 == (wi::shwi (1, mode_x)
7262 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7263 return CC_ADCmode;
7264
7265 /* A test for signed overflow. */
7266 if ((mode_x == DImode || mode_x == TImode)
7267 && code == NE
7268 && code_x == PLUS
7269 && GET_CODE (y) == SIGN_EXTEND)
7270 return CC_Vmode;
7271
7272 /* For everything else, return CCmode. */
7273 return CCmode;
7274 }
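
/* For illustration: a DImode comparison such as (compare (ashift x 2) y)
   is given CC_SWPmode above, because the shifted operand must appear as
   the second operand of the emitted CMP and the condition therefore has
   to be swapped, whereas a plain (compare x y) falls through to CCmode.
   Similarly, (compare (plus x y) 0) tested only for EQ/NE/LT/GE gets
   CC_NZmode so that an ADDS can set the flags directly.  */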
7275
7276 static int
7277 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7278
7279 int
7280 aarch64_get_condition_code (rtx x)
7281 {
7282 machine_mode mode = GET_MODE (XEXP (x, 0));
7283 enum rtx_code comp_code = GET_CODE (x);
7284
7285 if (GET_MODE_CLASS (mode) != MODE_CC)
7286 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7287 return aarch64_get_condition_code_1 (mode, comp_code);
7288 }
7289
7290 static int
7291 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7292 {
7293 switch (mode)
7294 {
7295 case E_CCFPmode:
7296 case E_CCFPEmode:
7297 switch (comp_code)
7298 {
7299 case GE: return AARCH64_GE;
7300 case GT: return AARCH64_GT;
7301 case LE: return AARCH64_LS;
7302 case LT: return AARCH64_MI;
7303 case NE: return AARCH64_NE;
7304 case EQ: return AARCH64_EQ;
7305 case ORDERED: return AARCH64_VC;
7306 case UNORDERED: return AARCH64_VS;
7307 case UNLT: return AARCH64_LT;
7308 case UNLE: return AARCH64_LE;
7309 case UNGT: return AARCH64_HI;
7310 case UNGE: return AARCH64_PL;
7311 default: return -1;
7312 }
7313 break;
7314
7315 case E_CCmode:
7316 switch (comp_code)
7317 {
7318 case NE: return AARCH64_NE;
7319 case EQ: return AARCH64_EQ;
7320 case GE: return AARCH64_GE;
7321 case GT: return AARCH64_GT;
7322 case LE: return AARCH64_LE;
7323 case LT: return AARCH64_LT;
7324 case GEU: return AARCH64_CS;
7325 case GTU: return AARCH64_HI;
7326 case LEU: return AARCH64_LS;
7327 case LTU: return AARCH64_CC;
7328 default: return -1;
7329 }
7330 break;
7331
7332 case E_CC_SWPmode:
7333 switch (comp_code)
7334 {
7335 case NE: return AARCH64_NE;
7336 case EQ: return AARCH64_EQ;
7337 case GE: return AARCH64_LE;
7338 case GT: return AARCH64_LT;
7339 case LE: return AARCH64_GE;
7340 case LT: return AARCH64_GT;
7341 case GEU: return AARCH64_LS;
7342 case GTU: return AARCH64_CC;
7343 case LEU: return AARCH64_CS;
7344 case LTU: return AARCH64_HI;
7345 default: return -1;
7346 }
7347 break;
7348
7349 case E_CC_NZmode:
7350 switch (comp_code)
7351 {
7352 case NE: return AARCH64_NE;
7353 case EQ: return AARCH64_EQ;
7354 case GE: return AARCH64_PL;
7355 case LT: return AARCH64_MI;
7356 default: return -1;
7357 }
7358 break;
7359
7360 case E_CC_Zmode:
7361 switch (comp_code)
7362 {
7363 case NE: return AARCH64_NE;
7364 case EQ: return AARCH64_EQ;
7365 default: return -1;
7366 }
7367 break;
7368
7369 case E_CC_Cmode:
7370 switch (comp_code)
7371 {
7372 case LTU: return AARCH64_CS;
7373 case GEU: return AARCH64_CC;
7374 default: return -1;
7375 }
7376 break;
7377
7378 case E_CC_ADCmode:
7379 switch (comp_code)
7380 {
7381 case GEU: return AARCH64_CS;
7382 case LTU: return AARCH64_CC;
7383 default: return -1;
7384 }
7385 break;
7386
7387 case E_CC_Vmode:
7388 switch (comp_code)
7389 {
7390 case NE: return AARCH64_VS;
7391 case EQ: return AARCH64_VC;
7392 default: return -1;
7393 }
7394 break;
7395
7396 default:
7397 return -1;
7398 }
7399
7400 return -1;
7401 }
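
/* For illustration: with CCmode flags, (gt ...) maps to the "gt"
   condition, but with CC_SWPmode the operands were swapped when the
   flags were set, so the table above returns the mirrored condition and
   (gt ...) is printed as "lt".  */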
7402
7403 bool
7404 aarch64_const_vec_all_same_in_range_p (rtx x,
7405 HOST_WIDE_INT minval,
7406 HOST_WIDE_INT maxval)
7407 {
7408 rtx elt;
7409 return (const_vec_duplicate_p (x, &elt)
7410 && CONST_INT_P (elt)
7411 && IN_RANGE (INTVAL (elt), minval, maxval));
7412 }
7413
7414 bool
7415 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7416 {
7417 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7418 }
7419
7420 /* Return true if VEC is a constant in which every element is in the range
7421 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7422
7423 static bool
7424 aarch64_const_vec_all_in_range_p (rtx vec,
7425 HOST_WIDE_INT minval,
7426 HOST_WIDE_INT maxval)
7427 {
7428 if (GET_CODE (vec) != CONST_VECTOR
7429 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7430 return false;
7431
7432 int nunits;
7433 if (!CONST_VECTOR_STEPPED_P (vec))
7434 nunits = const_vector_encoded_nelts (vec);
7435 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7436 return false;
7437
7438 for (int i = 0; i < nunits; i++)
7439 {
7440 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7441 if (!CONST_INT_P (vec_elem)
7442 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7443 return false;
7444 }
7445 return true;
7446 }
7447
7448 /* N Z C V. */
7449 #define AARCH64_CC_V 1
7450 #define AARCH64_CC_C (1 << 1)
7451 #define AARCH64_CC_Z (1 << 2)
7452 #define AARCH64_CC_N (1 << 3)
7453
7454 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7455 static const int aarch64_nzcv_codes[] =
7456 {
7457 0, /* EQ, Z == 1. */
7458 AARCH64_CC_Z, /* NE, Z == 0. */
7459 0, /* CS, C == 1. */
7460 AARCH64_CC_C, /* CC, C == 0. */
7461 0, /* MI, N == 1. */
7462 AARCH64_CC_N, /* PL, N == 0. */
7463 0, /* VS, V == 1. */
7464 AARCH64_CC_V, /* VC, V == 0. */
7465 0, /* HI, C == 1 && Z == 0. */
7466 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7467 AARCH64_CC_V, /* GE, N == V. */
7468 0, /* LT, N != V. */
7469 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7470 0, /* LE, !(Z == 0 && N == V). */
7471 0, /* AL, Any. */
7472 0 /* NV, Any. */
7473 };
7474
7475 /* Print floating-point vector immediate operand X to F, negating it
7476 first if NEGATE is true. Return true on success, false if it isn't
7477 a constant we can handle. */
7478
7479 static bool
7480 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7481 {
7482 rtx elt;
7483
7484 if (!const_vec_duplicate_p (x, &elt))
7485 return false;
7486
7487 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7488 if (negate)
7489 r = real_value_negate (&r);
7490
7491 /* We only handle the SVE single-bit immediates here. */
7492 if (real_equal (&r, &dconst0))
7493 asm_fprintf (f, "0.0");
7494 else if (real_equal (&r, &dconst1))
7495 asm_fprintf (f, "1.0");
7496 else if (real_equal (&r, &dconsthalf))
7497 asm_fprintf (f, "0.5");
7498 else
7499 return false;
7500
7501 return true;
7502 }
7503
7504 /* Return the equivalent letter for size. */
7505 static char
7506 sizetochar (int size)
7507 {
7508 switch (size)
7509 {
7510 case 64: return 'd';
7511 case 32: return 's';
7512 case 16: return 'h';
7513 case 8 : return 'b';
7514 default: gcc_unreachable ();
7515 }
7516 }
7517
7518 /* Print operand X to file F in a target specific manner according to CODE.
7519 The acceptable formatting commands given by CODE are:
7520 'c': An integer or symbol address without a preceding #
7521 sign.
7522 'C': Take the duplicated element in a vector constant
7523 and print it in hex.
7524 'D': Take the duplicated element in a vector constant
7525 and print it as an unsigned integer, in decimal.
7526 'e': Print the sign/zero-extend size as a character 8->b,
7527 16->h, 32->w.
7528 'p': Prints N such that 2^N == X (X must be power of 2 and
7529 const int).
7530 'P': Print the number of non-zero bits in X (a const_int).
7531 'H': Print the higher numbered register of a pair (TImode)
7532 of regs.
7533 'm': Print a condition (eq, ne, etc).
7534 'M': Same as 'm', but invert condition.
7535 'N': Take the duplicated element in a vector constant
7536 and print the negative of it in decimal.
7537 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7538 'S/T/U/V': Print a FP/SIMD register name for a register list.
7539 The register printed is the FP/SIMD register name
7540 of X + 0/1/2/3 for S/T/U/V.
7541 'R': Print a scalar FP/SIMD register name + 1.
7542 'X': Print bottom 16 bits of integer constant in hex.
7543 'w/x': Print a general register name or the zero register
7544 (32-bit or 64-bit).
7545 '0': Print a normal operand; if it's a general register,
7546 then we assume DImode.
7547 'k': Print NZCV for conditional compare instructions.
7548 'A': Output address constant representing the first
7549 argument of X, specifying a relocation offset
7550 if appropriate.
7551 'L': Output constant address specified by X
7552 with a relocation offset if appropriate.
7553 'G': Prints address of X, specifying a PC relative
7554 relocation mode if appropriate.
7555 'y': Output address of LDP or STP - this is used for
7556 some LDP/STPs which don't use a PARALLEL in their
7557 pattern (so the mode needs to be adjusted).
7558 'z': Output address of a typical LDP or STP. */
7559
7560 static void
7561 aarch64_print_operand (FILE *f, rtx x, int code)
7562 {
7563 rtx elt;
7564 switch (code)
7565 {
7566 case 'c':
7567 switch (GET_CODE (x))
7568 {
7569 case CONST_INT:
7570 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7571 break;
7572
7573 case SYMBOL_REF:
7574 output_addr_const (f, x);
7575 break;
7576
7577 case CONST:
7578 if (GET_CODE (XEXP (x, 0)) == PLUS
7579 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7580 {
7581 output_addr_const (f, x);
7582 break;
7583 }
7584 /* Fall through. */
7585
7586 default:
7587 output_operand_lossage ("unsupported operand for code '%c'", code);
7588 }
7589 break;
7590
7591 case 'e':
7592 {
7593 int n;
7594
7595 if (!CONST_INT_P (x)
7596 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7597 {
7598 output_operand_lossage ("invalid operand for '%%%c'", code);
7599 return;
7600 }
7601
7602 switch (n)
7603 {
7604 case 3:
7605 fputc ('b', f);
7606 break;
7607 case 4:
7608 fputc ('h', f);
7609 break;
7610 case 5:
7611 fputc ('w', f);
7612 break;
7613 default:
7614 output_operand_lossage ("invalid operand for '%%%c'", code);
7615 return;
7616 }
7617 }
7618 break;
7619
7620 case 'p':
7621 {
7622 int n;
7623
7624 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7625 {
7626 output_operand_lossage ("invalid operand for '%%%c'", code);
7627 return;
7628 }
7629
7630 asm_fprintf (f, "%d", n);
7631 }
7632 break;
7633
7634 case 'P':
7635 if (!CONST_INT_P (x))
7636 {
7637 output_operand_lossage ("invalid operand for '%%%c'", code);
7638 return;
7639 }
7640
7641 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7642 break;
7643
7644 case 'H':
7645 if (x == const0_rtx)
7646 {
7647 asm_fprintf (f, "xzr");
7648 break;
7649 }
7650
7651 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7652 {
7653 output_operand_lossage ("invalid operand for '%%%c'", code);
7654 return;
7655 }
7656
7657 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7658 break;
7659
7660 case 'M':
7661 case 'm':
7662 {
7663 int cond_code;
7664 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7665 if (x == const_true_rtx)
7666 {
7667 if (code == 'M')
7668 fputs ("nv", f);
7669 return;
7670 }
7671
7672 if (!COMPARISON_P (x))
7673 {
7674 output_operand_lossage ("invalid operand for '%%%c'", code);
7675 return;
7676 }
7677
7678 cond_code = aarch64_get_condition_code (x);
7679 gcc_assert (cond_code >= 0);
7680 if (code == 'M')
7681 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7682 fputs (aarch64_condition_codes[cond_code], f);
7683 }
7684 break;
7685
7686 case 'N':
7687 if (!const_vec_duplicate_p (x, &elt))
7688 {
7689 output_operand_lossage ("invalid vector constant");
7690 return;
7691 }
7692
7693 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7694 asm_fprintf (f, "%wd", -INTVAL (elt));
7695 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7696 && aarch64_print_vector_float_operand (f, x, true))
7697 ;
7698 else
7699 {
7700 output_operand_lossage ("invalid vector constant");
7701 return;
7702 }
7703 break;
7704
7705 case 'b':
7706 case 'h':
7707 case 's':
7708 case 'd':
7709 case 'q':
7710 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7711 {
7712 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7713 return;
7714 }
7715 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7716 break;
7717
7718 case 'S':
7719 case 'T':
7720 case 'U':
7721 case 'V':
7722 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7723 {
7724 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7725 return;
7726 }
7727 asm_fprintf (f, "%c%d",
7728 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7729 REGNO (x) - V0_REGNUM + (code - 'S'));
7730 break;
7731
7732 case 'R':
7733 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7734 {
7735 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7736 return;
7737 }
7738 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7739 break;
7740
7741 case 'X':
7742 if (!CONST_INT_P (x))
7743 {
7744 output_operand_lossage ("invalid operand for '%%%c'", code);
7745 return;
7746 }
7747 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7748 break;
7749
7750 case 'C':
7751 {
7752 /* Print a replicated constant in hex. */
7753 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7754 {
7755 output_operand_lossage ("invalid operand for '%%%c'", code);
7756 return;
7757 }
7758 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7759 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7760 }
7761 break;
7762
7763 case 'D':
7764 {
7765 /* Print a replicated constant in decimal, treating it as
7766 unsigned. */
7767 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7768 {
7769 output_operand_lossage ("invalid operand for '%%%c'", code);
7770 return;
7771 }
7772 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7773 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7774 }
7775 break;
7776
7777 case 'w':
7778 case 'x':
7779 if (x == const0_rtx
7780 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7781 {
7782 asm_fprintf (f, "%czr", code);
7783 break;
7784 }
7785
7786 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7787 {
7788 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7789 break;
7790 }
7791
7792 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7793 {
7794 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7795 break;
7796 }
7797
7798 /* Fall through */
7799
7800 case 0:
7801 if (x == NULL)
7802 {
7803 output_operand_lossage ("missing operand");
7804 return;
7805 }
7806
7807 switch (GET_CODE (x))
7808 {
7809 case REG:
7810 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7811 {
7812 if (REG_NREGS (x) == 1)
7813 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7814 else
7815 {
7816 char suffix
7817 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7818 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7819 REGNO (x) - V0_REGNUM, suffix,
7820 END_REGNO (x) - V0_REGNUM - 1, suffix);
7821 }
7822 }
7823 else
7824 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7825 break;
7826
7827 case MEM:
7828 output_address (GET_MODE (x), XEXP (x, 0));
7829 break;
7830
7831 case LABEL_REF:
7832 case SYMBOL_REF:
7833 output_addr_const (asm_out_file, x);
7834 break;
7835
7836 case CONST_INT:
7837 asm_fprintf (f, "%wd", INTVAL (x));
7838 break;
7839
7840 case CONST:
7841 if (!VECTOR_MODE_P (GET_MODE (x)))
7842 {
7843 output_addr_const (asm_out_file, x);
7844 break;
7845 }
7846 /* fall through */
7847
7848 case CONST_VECTOR:
7849 if (!const_vec_duplicate_p (x, &elt))
7850 {
7851 output_operand_lossage ("invalid vector constant");
7852 return;
7853 }
7854
7855 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7856 asm_fprintf (f, "%wd", INTVAL (elt));
7857 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7858 && aarch64_print_vector_float_operand (f, x, false))
7859 ;
7860 else
7861 {
7862 output_operand_lossage ("invalid vector constant");
7863 return;
7864 }
7865 break;
7866
7867 case CONST_DOUBLE:
7868 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7869 be getting CONST_DOUBLEs holding integers. */
7870 gcc_assert (GET_MODE (x) != VOIDmode);
7871 if (aarch64_float_const_zero_rtx_p (x))
7872 {
7873 fputc ('0', f);
7874 break;
7875 }
7876 else if (aarch64_float_const_representable_p (x))
7877 {
7878 #define buf_size 20
7879 char float_buf[buf_size] = {'\0'};
7880 real_to_decimal_for_mode (float_buf,
7881 CONST_DOUBLE_REAL_VALUE (x),
7882 buf_size, buf_size,
7883 1, GET_MODE (x));
7884 asm_fprintf (asm_out_file, "%s", float_buf);
7885 break;
7886 #undef buf_size
7887 }
7888 output_operand_lossage ("invalid constant");
7889 return;
7890 default:
7891 output_operand_lossage ("invalid operand");
7892 return;
7893 }
7894 break;
7895
7896 case 'A':
7897 if (GET_CODE (x) == HIGH)
7898 x = XEXP (x, 0);
7899
7900 switch (aarch64_classify_symbolic_expression (x))
7901 {
7902 case SYMBOL_SMALL_GOT_4G:
7903 asm_fprintf (asm_out_file, ":got:");
7904 break;
7905
7906 case SYMBOL_SMALL_TLSGD:
7907 asm_fprintf (asm_out_file, ":tlsgd:");
7908 break;
7909
7910 case SYMBOL_SMALL_TLSDESC:
7911 asm_fprintf (asm_out_file, ":tlsdesc:");
7912 break;
7913
7914 case SYMBOL_SMALL_TLSIE:
7915 asm_fprintf (asm_out_file, ":gottprel:");
7916 break;
7917
7918 case SYMBOL_TLSLE24:
7919 asm_fprintf (asm_out_file, ":tprel:");
7920 break;
7921
7922 case SYMBOL_TINY_GOT:
7923 gcc_unreachable ();
7924 break;
7925
7926 default:
7927 break;
7928 }
7929 output_addr_const (asm_out_file, x);
7930 break;
7931
7932 case 'L':
7933 switch (aarch64_classify_symbolic_expression (x))
7934 {
7935 case SYMBOL_SMALL_GOT_4G:
7936 asm_fprintf (asm_out_file, ":lo12:");
7937 break;
7938
7939 case SYMBOL_SMALL_TLSGD:
7940 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7941 break;
7942
7943 case SYMBOL_SMALL_TLSDESC:
7944 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7945 break;
7946
7947 case SYMBOL_SMALL_TLSIE:
7948 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7949 break;
7950
7951 case SYMBOL_TLSLE12:
7952 asm_fprintf (asm_out_file, ":tprel_lo12:");
7953 break;
7954
7955 case SYMBOL_TLSLE24:
7956 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7957 break;
7958
7959 case SYMBOL_TINY_GOT:
7960 asm_fprintf (asm_out_file, ":got:");
7961 break;
7962
7963 case SYMBOL_TINY_TLSIE:
7964 asm_fprintf (asm_out_file, ":gottprel:");
7965 break;
7966
7967 default:
7968 break;
7969 }
7970 output_addr_const (asm_out_file, x);
7971 break;
7972
7973 case 'G':
7974 switch (aarch64_classify_symbolic_expression (x))
7975 {
7976 case SYMBOL_TLSLE24:
7977 asm_fprintf (asm_out_file, ":tprel_hi12:");
7978 break;
7979 default:
7980 break;
7981 }
7982 output_addr_const (asm_out_file, x);
7983 break;
7984
7985 case 'k':
7986 {
7987 HOST_WIDE_INT cond_code;
7988
7989 if (!CONST_INT_P (x))
7990 {
7991 output_operand_lossage ("invalid operand for '%%%c'", code);
7992 return;
7993 }
7994
7995 cond_code = INTVAL (x);
7996 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7997 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7998 }
7999 break;
8000
8001 case 'y':
8002 case 'z':
8003 {
8004 machine_mode mode = GET_MODE (x);
8005
8006 if (GET_CODE (x) != MEM
8007 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8008 {
8009 output_operand_lossage ("invalid operand for '%%%c'", code);
8010 return;
8011 }
8012
8013 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8014 code == 'y'
8015 ? ADDR_QUERY_LDP_STP_N
8016 : ADDR_QUERY_LDP_STP))
8017 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8018 }
8019 break;
8020
8021 default:
8022 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8023 return;
8024 }
8025 }
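
/* For illustration (the templates below are representative examples):
   in a machine-description template along the lines of

       "add\t%w0, %w1, %2"
       "fmov\t%d0, %x1"

   the 'w' and 'x' codes above print 32-bit and 64-bit general register
   names (or wzr/xzr for a zero operand), code 0 prints the operand in
   its natural form, and 'd' prints the scalar D view of a FP/SIMD
   register.  */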
8026
8027 /* Print address 'x' of a memory access with mode 'mode'.
8028 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address,
8029 e.g. ADDR_QUERY_ANY for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
8030 static bool
8031 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8032 aarch64_addr_query_type type)
8033 {
8034 struct aarch64_address_info addr;
8035 unsigned int size;
8036
8037 /* Check all addresses are Pmode - including ILP32. */
8038 if (GET_MODE (x) != Pmode
8039 && (!CONST_INT_P (x)
8040 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8041 {
8042 output_operand_lossage ("invalid address mode");
8043 return false;
8044 }
8045
8046 if (aarch64_classify_address (&addr, x, mode, true, type))
8047 switch (addr.type)
8048 {
8049 case ADDRESS_REG_IMM:
8050 if (known_eq (addr.const_offset, 0))
8051 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8052 else if (aarch64_sve_data_mode_p (mode))
8053 {
8054 HOST_WIDE_INT vnum
8055 = exact_div (addr.const_offset,
8056 BYTES_PER_SVE_VECTOR).to_constant ();
8057 asm_fprintf (f, "[%s, #%wd, mul vl]",
8058 reg_names[REGNO (addr.base)], vnum);
8059 }
8060 else if (aarch64_sve_pred_mode_p (mode))
8061 {
8062 HOST_WIDE_INT vnum
8063 = exact_div (addr.const_offset,
8064 BYTES_PER_SVE_PRED).to_constant ();
8065 asm_fprintf (f, "[%s, #%wd, mul vl]",
8066 reg_names[REGNO (addr.base)], vnum);
8067 }
8068 else
8069 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8070 INTVAL (addr.offset));
8071 return true;
8072
8073 case ADDRESS_REG_REG:
8074 if (addr.shift == 0)
8075 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8076 reg_names [REGNO (addr.offset)]);
8077 else
8078 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8079 reg_names [REGNO (addr.offset)], addr.shift);
8080 return true;
8081
8082 case ADDRESS_REG_UXTW:
8083 if (addr.shift == 0)
8084 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8085 REGNO (addr.offset) - R0_REGNUM);
8086 else
8087 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8088 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8089 return true;
8090
8091 case ADDRESS_REG_SXTW:
8092 if (addr.shift == 0)
8093 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8094 REGNO (addr.offset) - R0_REGNUM);
8095 else
8096 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8097 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8098 return true;
8099
8100 case ADDRESS_REG_WB:
8101 /* Writeback is only supported for fixed-width modes. */
8102 size = GET_MODE_SIZE (mode).to_constant ();
8103 switch (GET_CODE (x))
8104 {
8105 case PRE_INC:
8106 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8107 return true;
8108 case POST_INC:
8109 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8110 return true;
8111 case PRE_DEC:
8112 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8113 return true;
8114 case POST_DEC:
8115 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8116 return true;
8117 case PRE_MODIFY:
8118 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8119 INTVAL (addr.offset));
8120 return true;
8121 case POST_MODIFY:
8122 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8123 INTVAL (addr.offset));
8124 return true;
8125 default:
8126 break;
8127 }
8128 break;
8129
8130 case ADDRESS_LO_SUM:
8131 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8132 output_addr_const (f, addr.offset);
8133 asm_fprintf (f, "]");
8134 return true;
8135
8136 case ADDRESS_SYMBOLIC:
8137 output_addr_const (f, x);
8138 return true;
8139 }
8140
8141 return false;
8142 }
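
/* For illustration, the syntax produced above (register numbers chosen
   arbitrarily, DImode access):

       ADDRESS_REG_IMM    [x0, 16]
       ADDRESS_REG_REG    [x0, x1, lsl 3]
       ADDRESS_REG_UXTW   [x0, w1, uxtw 3]
       ADDRESS_REG_WB     [x0, 16]!  or  [x0], 16
       ADDRESS_LO_SUM     [x0, #:lo12:symbol]
       ADDRESS_SYMBOLIC   symbol  */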
8143
8144 /* Print address 'x' of a memory access with mode 'mode'. */
8145 static void
8146 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8147 {
8148 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8149 output_addr_const (f, x);
8150 }
8151
8152 bool
8153 aarch64_label_mentioned_p (rtx x)
8154 {
8155 const char *fmt;
8156 int i;
8157
8158 if (GET_CODE (x) == LABEL_REF)
8159 return true;
8160
8161 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8162 referencing instruction, but they are constant offsets, not
8163 symbols. */
8164 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8165 return false;
8166
8167 fmt = GET_RTX_FORMAT (GET_CODE (x));
8168 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8169 {
8170 if (fmt[i] == 'E')
8171 {
8172 int j;
8173
8174 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8175 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8176 return 1;
8177 }
8178 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8179 return 1;
8180 }
8181
8182 return 0;
8183 }
8184
8185 /* Implement REGNO_REG_CLASS. */
8186
8187 enum reg_class
8188 aarch64_regno_regclass (unsigned regno)
8189 {
8190 if (GP_REGNUM_P (regno))
8191 return GENERAL_REGS;
8192
8193 if (regno == SP_REGNUM)
8194 return STACK_REG;
8195
8196 if (regno == FRAME_POINTER_REGNUM
8197 || regno == ARG_POINTER_REGNUM)
8198 return POINTER_REGS;
8199
8200 if (FP_REGNUM_P (regno))
8201 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8202
8203 if (PR_REGNUM_P (regno))
8204 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8205
8206 return NO_REGS;
8207 }
8208
8209 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8210 If OFFSET is out of range, return an offset of an anchor point
8211 that is in range. Return 0 otherwise. */
8212
8213 static HOST_WIDE_INT
8214 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8215 machine_mode mode)
8216 {
8217 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8218 if (size > 16)
8219 return (offset + 0x400) & ~0x7f0;
8220
8221 /* For offsets that aren't a multiple of the access size, the limit is
8222 -256...255. */
8223 if (offset & (size - 1))
8224 {
8225 /* BLKmode typically uses LDP of X-registers. */
8226 if (mode == BLKmode)
8227 return (offset + 512) & ~0x3ff;
8228 return (offset + 0x100) & ~0x1ff;
8229 }
8230
8231 /* Small negative offsets are supported. */
8232 if (IN_RANGE (offset, -256, 0))
8233 return 0;
8234
8235 if (mode == TImode || mode == TFmode)
8236 return (offset + 0x100) & ~0x1ff;
8237
8238 /* Use a 12-bit offset, scaled by the access size. */
8239 return offset & (~0xfff * size);
8240 }
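
/* A worked example (illustrative only): for an aligned SImode access
   (size 4) at offset 0x12340, the code above returns

       0x12340 & (~0xfff * 4) == 0x10000

   so the anchor goes at base + 0x10000 and the access keeps the
   remaining offset 0x2340, which fits the unsigned scaled 12-bit range
   0..0x3ffc.  An unaligned offset would instead be rounded towards the
   signed 9-bit range -256..255.  */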
8241
8242 static rtx
8243 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8244 {
8245 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8246 where mask is selected by alignment and size of the offset.
8247 We try to pick as large a range for the offset as possible to
8248 maximize the chance of a CSE. However, for aligned addresses
8249 we limit the range to 4k so that structures with different sized
8250 elements are likely to use the same base. We need to be careful
8251 not to split a CONST for some forms of address expression, otherwise
8252 it will generate sub-optimal code. */
8253
8254 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8255 {
8256 rtx base = XEXP (x, 0);
8257 rtx offset_rtx = XEXP (x, 1);
8258 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8259
8260 if (GET_CODE (base) == PLUS)
8261 {
8262 rtx op0 = XEXP (base, 0);
8263 rtx op1 = XEXP (base, 1);
8264
8265 /* Force any scaling into a temp for CSE. */
8266 op0 = force_reg (Pmode, op0);
8267 op1 = force_reg (Pmode, op1);
8268
8269 /* Let the pointer register be in op0. */
8270 if (REG_POINTER (op1))
8271 std::swap (op0, op1);
8272
8273 /* If the pointer is virtual or frame related, then we know that
8274 virtual register instantiation or register elimination is going
8275 to apply a second constant. We want the two constants folded
8276 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8277 if (virt_or_elim_regno_p (REGNO (op0)))
8278 {
8279 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8280 NULL_RTX, true, OPTAB_DIRECT);
8281 return gen_rtx_PLUS (Pmode, base, op1);
8282 }
8283
8284 /* Otherwise, in order to encourage CSE (and thence loop strength
8285 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8286 base = expand_binop (Pmode, add_optab, op0, op1,
8287 NULL_RTX, true, OPTAB_DIRECT);
8288 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8289 }
8290
8291 HOST_WIDE_INT size;
8292 if (GET_MODE_SIZE (mode).is_constant (&size))
8293 {
8294 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8295 mode);
8296 if (base_offset != 0)
8297 {
8298 base = plus_constant (Pmode, base, base_offset);
8299 base = force_operand (base, NULL_RTX);
8300 return plus_constant (Pmode, base, offset - base_offset);
8301 }
8302 }
8303 }
8304
8305 return x;
8306 }
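
/* For illustration (registers and temporaries chosen arbitrarily):
   legitimizing (plus (reg x0) (const_int 0x12340)) for an SImode access
   uses the anchor above and produces code along the lines of

       add   x1, x0, 0x10000     // anchor, shareable between accesses
       ldr   w2, [x1, 0x2340]    // remaining in-range offset

   so nearby accesses can CSE the same anchor register.  */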
8307
8308 static reg_class_t
8309 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8310 reg_class_t rclass,
8311 machine_mode mode,
8312 secondary_reload_info *sri)
8313 {
8314 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8315 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8316 comment at the head of aarch64-sve.md for more details about the
8317 big-endian handling. */
8318 if (BYTES_BIG_ENDIAN
8319 && reg_class_subset_p (rclass, FP_REGS)
8320 && !((REG_P (x) && HARD_REGISTER_P (x))
8321 || aarch64_simd_valid_immediate (x, NULL))
8322 && aarch64_sve_data_mode_p (mode))
8323 {
8324 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8325 return NO_REGS;
8326 }
8327
8328 /* If we have to disable direct literal pool loads and stores because the
8329 function is too big, then we need a scratch register. */
8330 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8331 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8332 || targetm.vector_mode_supported_p (GET_MODE (x)))
8333 && !aarch64_pcrelative_literal_loads)
8334 {
8335 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8336 return NO_REGS;
8337 }
8338
8339 /* Without the TARGET_SIMD instructions we cannot move a Q register
8340 to a Q register directly. We need a scratch. */
8341 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8342 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8343 && reg_class_subset_p (rclass, FP_REGS))
8344 {
8345 sri->icode = code_for_aarch64_reload_mov (mode);
8346 return NO_REGS;
8347 }
8348
8349 /* A TFmode or TImode memory access should be handled via FP_REGS
8350 because AArch64 has richer addressing modes for LDR/STR instructions
8351 than LDP/STP instructions. */
8352 if (TARGET_FLOAT && rclass == GENERAL_REGS
8353 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8354 return FP_REGS;
8355
8356 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8357 return GENERAL_REGS;
8358
8359 return NO_REGS;
8360 }
8361
8362 static bool
8363 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8364 {
8365 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8366
8367 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8368 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8369 if (frame_pointer_needed)
8370 return to == HARD_FRAME_POINTER_REGNUM;
8371 return true;
8372 }
8373
8374 poly_int64
8375 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8376 {
8377 if (to == HARD_FRAME_POINTER_REGNUM)
8378 {
8379 if (from == ARG_POINTER_REGNUM)
8380 return cfun->machine->frame.hard_fp_offset;
8381
8382 if (from == FRAME_POINTER_REGNUM)
8383 return cfun->machine->frame.hard_fp_offset
8384 - cfun->machine->frame.locals_offset;
8385 }
8386
8387 if (to == STACK_POINTER_REGNUM)
8388 {
8389 if (from == FRAME_POINTER_REGNUM)
8390 return cfun->machine->frame.frame_size
8391 - cfun->machine->frame.locals_offset;
8392 }
8393
8394 return cfun->machine->frame.frame_size;
8395 }
8396
8397 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8398 previous frame. */
8399
8400 rtx
8401 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8402 {
8403 if (count != 0)
8404 return const0_rtx;
8405 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8406 }
8407
8408
8409 static void
8410 aarch64_asm_trampoline_template (FILE *f)
8411 {
8412 int offset1 = 16;
8413 int offset2 = 20;
8414
8415 if (aarch64_bti_enabled ())
8416 {
8417 asm_fprintf (f, "\thint\t34 // bti c\n");
8418 offset1 -= 4;
8419 offset2 -= 4;
8420 }
8421
8422 if (TARGET_ILP32)
8423 {
8424 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8425 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8426 offset1);
8427 }
8428 else
8429 {
8430 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8431 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8432 offset2);
8433 }
8434 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8435
8436 /* The trampoline needs an extra padding instruction. If BTI is
8437 enabled, the padding instruction is replaced by the BTI instruction at
8438 the beginning. */
8439 if (!aarch64_bti_enabled ())
8440 assemble_aligned_integer (4, const0_rtx);
8441
8442 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8443 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8444 }
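
/* For illustration: without BTI and in LP64 the template above expands
   to roughly

       ldr   x17, .+16      // load the target function address
       ldr   x18, .+20      // load the static chain value
       br    x17
       <4 bytes of padding>
       <8-byte slot: function address, filled in by aarch64_trampoline_init>
       <8-byte slot: static chain,    filled in by aarch64_trampoline_init>

   where x17 is IP1_REGNUM and x18 is STATIC_CHAIN_REGNUM.  */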
8445
8446 static void
8447 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8448 {
8449 rtx fnaddr, mem, a_tramp;
8450 const int tramp_code_sz = 16;
8451
8452 /* Don't need to copy the trailing D-words; we fill those in below. */
8453 emit_block_move (m_tramp, assemble_trampoline_template (),
8454 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8455 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8456 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8457 if (GET_MODE (fnaddr) != ptr_mode)
8458 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8459 emit_move_insn (mem, fnaddr);
8460
8461 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8462 emit_move_insn (mem, chain_value);
8463
8464 /* XXX We should really define a "clear_cache" pattern and use
8465 gen_clear_cache(). */
8466 a_tramp = XEXP (m_tramp, 0);
8467 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8468 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8469 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8470 ptr_mode);
8471 }
8472
8473 static unsigned char
8474 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8475 {
8476 /* ??? Logically we should only need to provide a value when
8477 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8478 can hold MODE, but at the moment we need to handle all modes.
8479 Just ignore any runtime parts for registers that can't store them. */
8480 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8481 unsigned int nregs;
8482 switch (regclass)
8483 {
8484 case TAILCALL_ADDR_REGS:
8485 case POINTER_REGS:
8486 case GENERAL_REGS:
8487 case ALL_REGS:
8488 case POINTER_AND_FP_REGS:
8489 case FP_REGS:
8490 case FP_LO_REGS:
8491 if (aarch64_sve_data_mode_p (mode)
8492 && constant_multiple_p (GET_MODE_SIZE (mode),
8493 BYTES_PER_SVE_VECTOR, &nregs))
8494 return nregs;
8495 return (aarch64_vector_data_mode_p (mode)
8496 ? CEIL (lowest_size, UNITS_PER_VREG)
8497 : CEIL (lowest_size, UNITS_PER_WORD));
8498 case STACK_REG:
8499 case PR_REGS:
8500 case PR_LO_REGS:
8501 case PR_HI_REGS:
8502 return 1;
8503
8504 case NO_REGS:
8505 return 0;
8506
8507 default:
8508 break;
8509 }
8510 gcc_unreachable ();
8511 }
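
/* For illustration: with the rules above, aarch64_class_max_nregs
   (FP_REGS, V4SImode) is 1 (a single 128-bit Q register), while
   aarch64_class_max_nregs (GENERAL_REGS, TImode) is 2, since a TImode
   value occupies a pair of X registers.  */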
8512
8513 static reg_class_t
8514 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8515 {
8516 if (regclass == POINTER_REGS)
8517 return GENERAL_REGS;
8518
8519 if (regclass == STACK_REG)
8520 {
8521 if (REG_P(x)
8522 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8523 return regclass;
8524
8525 return NO_REGS;
8526 }
8527
8528 /* Register elimination can result in a request for
8529 SP+constant->FP_REGS. We cannot support such operations, which
8530 use SP as source and an FP_REG as destination, so reject them
8531 right now. */
8532 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8533 {
8534 rtx lhs = XEXP (x, 0);
8535
8536 /* Look through a possible SUBREG introduced by ILP32. */
8537 if (GET_CODE (lhs) == SUBREG)
8538 lhs = SUBREG_REG (lhs);
8539
8540 gcc_assert (REG_P (lhs));
8541 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8542 POINTER_REGS));
8543 return NO_REGS;
8544 }
8545
8546 return regclass;
8547 }
8548
8549 void
8550 aarch64_asm_output_labelref (FILE* f, const char *name)
8551 {
8552 asm_fprintf (f, "%U%s", name);
8553 }
8554
8555 static void
8556 aarch64_elf_asm_constructor (rtx symbol, int priority)
8557 {
8558 if (priority == DEFAULT_INIT_PRIORITY)
8559 default_ctor_section_asm_out_constructor (symbol, priority);
8560 else
8561 {
8562 section *s;
8563 /* Although priority is known to be in the range [0, 65535], so that
8564 18 bytes would be enough, the compiler might not know that. To avoid
8565 a -Wformat-truncation false positive, use a larger size. */
8566 char buf[23];
8567 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8568 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8569 switch_to_section (s);
8570 assemble_align (POINTER_SIZE);
8571 assemble_aligned_integer (POINTER_BYTES, symbol);
8572 }
8573 }
8574
8575 static void
8576 aarch64_elf_asm_destructor (rtx symbol, int priority)
8577 {
8578 if (priority == DEFAULT_INIT_PRIORITY)
8579 default_dtor_section_asm_out_destructor (symbol, priority);
8580 else
8581 {
8582 section *s;
8583 /* Although priority is known to be in the range [0, 65535], so that
8584 18 bytes would be enough, the compiler might not know that. To avoid
8585 a -Wformat-truncation false positive, use a larger size. */
8586 char buf[23];
8587 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8588 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8589 switch_to_section (s);
8590 assemble_align (POINTER_SIZE);
8591 assemble_aligned_integer (POINTER_BYTES, symbol);
8592 }
8593 }
8594
8595 const char*
8596 aarch64_output_casesi (rtx *operands)
8597 {
8598 char buf[100];
8599 char label[100];
8600 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8601 int index;
8602 static const char *const patterns[4][2] =
8603 {
8604 {
8605 "ldrb\t%w3, [%0,%w1,uxtw]",
8606 "add\t%3, %4, %w3, sxtb #2"
8607 },
8608 {
8609 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8610 "add\t%3, %4, %w3, sxth #2"
8611 },
8612 {
8613 "ldr\t%w3, [%0,%w1,uxtw #2]",
8614 "add\t%3, %4, %w3, sxtw #2"
8615 },
8616 /* We assume that DImode is only generated when not optimizing and
8617 that we don't really need 64-bit address offsets. That would
8618 imply an object file with 8GB of code in a single function! */
8619 {
8620 "ldr\t%w3, [%0,%w1,uxtw #2]",
8621 "add\t%3, %4, %w3, sxtw #2"
8622 }
8623 };
8624
8625 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8626
8627 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8628 index = exact_log2 (GET_MODE_SIZE (mode));
8629
8630 gcc_assert (index >= 0 && index <= 3);
8631
8632 /* Need to implement table size reduction, by changing the code below. */
8633 output_asm_insn (patterns[index][0], operands);
8634 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8635 snprintf (buf, sizeof (buf),
8636 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8637 output_asm_insn (buf, operands);
8638 output_asm_insn (patterns[index][1], operands);
8639 output_asm_insn ("br\t%3", operands);
8640 assemble_label (asm_out_file, label);
8641 return "";
8642 }
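
/* For illustration (register numbers follow the operand order, label
   name abbreviated): for a HImode dispatch table the routine above emits
   a sequence along the lines of

       ldrh  w3, [x0, w1, uxtw #1]   // load the 16-bit table entry
       adr   x4, .Lrtx<N>            // address of the anchor label
       add   x3, x4, w3, sxth #2     // entry is a scaled label difference
       br    x3
   .Lrtx<N>:  */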
8643
8644
8645 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8646 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8647 operator. */
8648
8649 int
8650 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8651 {
8652 if (shift >= 0 && shift <= 3)
8653 {
8654 int size;
8655 for (size = 8; size <= 32; size *= 2)
8656 {
8657 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8658 if (mask == bits << shift)
8659 return size;
8660 }
8661 }
8662 return 0;
8663 }
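
/* For illustration: aarch64_uxt_size (2, 0x3fc) returns 8, because
   0x3fc == 0xff << 2, i.e. the operand is a zero-extended byte shifted
   left by two -- exactly what the "uxtb #2" extended-register form
   expresses.  For a mask that is not a shifted 0xff/0xffff/0xffffffff
   the function returns 0.  */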
8664
8665 /* Constant pools are per function only when PC-relative
8666 literal loads are enabled or we are in the large memory
8667 model. */
8668
8669 static inline bool
8670 aarch64_can_use_per_function_literal_pools_p (void)
8671 {
8672 return (aarch64_pcrelative_literal_loads
8673 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8674 }
8675
8676 static bool
8677 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8678 {
8679 /* We can't use blocks for constants when we're using a per-function
8680 constant pool. */
8681 return !aarch64_can_use_per_function_literal_pools_p ();
8682 }
8683
8684 /* Select appropriate section for constants depending
8685 on where we place literal pools. */
8686
8687 static section *
8688 aarch64_select_rtx_section (machine_mode mode,
8689 rtx x,
8690 unsigned HOST_WIDE_INT align)
8691 {
8692 if (aarch64_can_use_per_function_literal_pools_p ())
8693 return function_section (current_function_decl);
8694
8695 return default_elf_select_rtx_section (mode, x, align);
8696 }
8697
8698 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8699 void
8700 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8701 HOST_WIDE_INT offset)
8702 {
8703 /* When using per-function literal pools, we must ensure that any code
8704 section is aligned to the minimal instruction length, lest we get
8705 errors from the assembler re "unaligned instructions". */
8706 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8707 ASM_OUTPUT_ALIGN (f, 2);
8708 }
8709
8710 /* Costs. */
8711
8712 /* Helper function for rtx cost calculation. Strip a shift expression
8713 from X. Returns the inner operand if successful, or the original
8714 expression on failure. */
8715 static rtx
8716 aarch64_strip_shift (rtx x)
8717 {
8718 rtx op = x;
8719
8720 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8721 we can convert both to ROR during final output. */
8722 if ((GET_CODE (op) == ASHIFT
8723 || GET_CODE (op) == ASHIFTRT
8724 || GET_CODE (op) == LSHIFTRT
8725 || GET_CODE (op) == ROTATERT
8726 || GET_CODE (op) == ROTATE)
8727 && CONST_INT_P (XEXP (op, 1)))
8728 return XEXP (op, 0);
8729
8730 if (GET_CODE (op) == MULT
8731 && CONST_INT_P (XEXP (op, 1))
8732 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8733 return XEXP (op, 0);
8734
8735 return x;
8736 }
8737
8738 /* Helper function for rtx cost calculation. Strip an extend
8739 expression from X. Returns the inner operand if successful, or the
8740 original expression on failure. We deal with a number of possible
8741 canonicalization variations here. If STRIP_SHIFT is true, then
8742 we can strip off a shift also. */
8743 static rtx
8744 aarch64_strip_extend (rtx x, bool strip_shift)
8745 {
8746 scalar_int_mode mode;
8747 rtx op = x;
8748
8749 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8750 return op;
8751
8752 /* Zero and sign extraction of a widened value. */
8753 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8754 && XEXP (op, 2) == const0_rtx
8755 && GET_CODE (XEXP (op, 0)) == MULT
8756 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8757 XEXP (op, 1)))
8758 return XEXP (XEXP (op, 0), 0);
8759
8760 /* It can also be represented (for zero-extend) as an AND with an
8761 immediate. */
8762 if (GET_CODE (op) == AND
8763 && GET_CODE (XEXP (op, 0)) == MULT
8764 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8765 && CONST_INT_P (XEXP (op, 1))
8766 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8767 INTVAL (XEXP (op, 1))) != 0)
8768 return XEXP (XEXP (op, 0), 0);
8769
8770 /* Now handle extended register, as this may also have an optional
8771 left shift by 1..4. */
8772 if (strip_shift
8773 && GET_CODE (op) == ASHIFT
8774 && CONST_INT_P (XEXP (op, 1))
8775 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8776 op = XEXP (op, 0);
8777
8778 if (GET_CODE (op) == ZERO_EXTEND
8779 || GET_CODE (op) == SIGN_EXTEND)
8780 op = XEXP (op, 0);
8781
8782 if (op != x)
8783 return op;
8784
8785 return x;
8786 }
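
/* For illustration: given the costed expression
   (plus (ashift (sign_extend:DI (reg:SI x)) (const_int 2)) (reg:DI y)),
   calling aarch64_strip_extend on the first operand with STRIP_SHIFT
   true peels off both the shift and the sign_extend and returns
   (reg:SI x), so only the inner register is costed again.  */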
8787
8788 /* Return true iff CODE is a shift supported in combination
8789 with arithmetic instructions. */
8790
8791 static bool
8792 aarch64_shift_p (enum rtx_code code)
8793 {
8794 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8795 }
8796
8797
8798 /* Return true iff X is a cheap shift without a sign extend. */
8799
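/* Here "cheap" means a left shift, or a multiply by a power of two, of at
   most four bit positions, on cores that set the
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND tuning flag. */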
8800 static bool
8801 aarch64_cheap_mult_shift_p (rtx x)
8802 {
8803 rtx op0, op1;
8804
8805 op0 = XEXP (x, 0);
8806 op1 = XEXP (x, 1);
8807
8808 if (!(aarch64_tune_params.extra_tuning_flags
8809 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8810 return false;
8811
8812 if (GET_CODE (op0) == SIGN_EXTEND)
8813 return false;
8814
8815 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8816 && UINTVAL (op1) <= 4)
8817 return true;
8818
8819 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8820 return false;
8821
8822 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8823
8824 if (l2 > 0 && l2 <= 4)
8825 return true;
8826
8827 return false;
8828 }
8829
8830 /* Helper function for rtx cost calculation. Calculate the cost of
8831 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8832 Return the calculated cost of the expression, recursing manually into
8833 operands where needed. */
8834
8835 static int
8836 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8837 {
8838 rtx op0, op1;
8839 const struct cpu_cost_table *extra_cost
8840 = aarch64_tune_params.insn_extra_cost;
8841 int cost = 0;
8842 bool compound_p = (outer == PLUS || outer == MINUS);
8843 machine_mode mode = GET_MODE (x);
8844
8845 gcc_checking_assert (code == MULT);
8846
8847 op0 = XEXP (x, 0);
8848 op1 = XEXP (x, 1);
8849
8850 if (VECTOR_MODE_P (mode))
8851 mode = GET_MODE_INNER (mode);
8852
8853 /* Integer multiply/fma. */
8854 if (GET_MODE_CLASS (mode) == MODE_INT)
8855 {
8856 /* The multiply will be canonicalized as a shift, so cost it as such. */
8857 if (aarch64_shift_p (GET_CODE (x))
8858 || (CONST_INT_P (op1)
8859 && exact_log2 (INTVAL (op1)) > 0))
8860 {
8861 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8862 || GET_CODE (op0) == SIGN_EXTEND;
8863 if (speed)
8864 {
8865 if (compound_p)
8866 {
8867 /* If the shift is considered cheap,
8868 then don't add any cost. */
8869 if (aarch64_cheap_mult_shift_p (x))
8870 ;
8871 else if (REG_P (op1))
8872 /* ARITH + shift-by-register. */
8873 cost += extra_cost->alu.arith_shift_reg;
8874 else if (is_extend)
8875 /* ARITH + extended register. We don't have a cost field
8876 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8877 cost += extra_cost->alu.extend_arith;
8878 else
8879 /* ARITH + shift-by-immediate. */
8880 cost += extra_cost->alu.arith_shift;
8881 }
8882 else
8883 /* LSL (immediate). */
8884 cost += extra_cost->alu.shift;
8885
8886 }
8887 /* Strip extends as we will have costed them in the case above. */
8888 if (is_extend)
8889 op0 = aarch64_strip_extend (op0, true);
8890
8891 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8892
8893 return cost;
8894 }
8895
8896 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8897 compound and let the cases below handle it. After all, MNEG is a
8898 special-case alias of MSUB. */
8899 if (GET_CODE (op0) == NEG)
8900 {
8901 op0 = XEXP (op0, 0);
8902 compound_p = true;
8903 }
8904
8905 /* Integer multiplies or FMAs have zero/sign extending variants. */
8906 if ((GET_CODE (op0) == ZERO_EXTEND
8907 && GET_CODE (op1) == ZERO_EXTEND)
8908 || (GET_CODE (op0) == SIGN_EXTEND
8909 && GET_CODE (op1) == SIGN_EXTEND))
8910 {
8911 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8912 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8913
8914 if (speed)
8915 {
8916 if (compound_p)
8917 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8918 cost += extra_cost->mult[0].extend_add;
8919 else
8920 /* MUL/SMULL/UMULL. */
8921 cost += extra_cost->mult[0].extend;
8922 }
8923
8924 return cost;
8925 }
8926
8927 /* This is either an integer multiply or a MADD. In both cases
8928 we want to recurse and cost the operands. */
8929 cost += rtx_cost (op0, mode, MULT, 0, speed);
8930 cost += rtx_cost (op1, mode, MULT, 1, speed);
8931
8932 if (speed)
8933 {
8934 if (compound_p)
8935 /* MADD/MSUB. */
8936 cost += extra_cost->mult[mode == DImode].add;
8937 else
8938 /* MUL. */
8939 cost += extra_cost->mult[mode == DImode].simple;
8940 }
8941
8942 return cost;
8943 }
8944 else
8945 {
8946 if (speed)
8947 {
8948 /* Floating-point FMA/FMUL can also support negations of the
8949 operands, unless the rounding mode is upward or downward, in
8950 which case FNMUL differs from FMUL with operand negation. */
8951 bool neg0 = GET_CODE (op0) == NEG;
8952 bool neg1 = GET_CODE (op1) == NEG;
8953 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8954 {
8955 if (neg0)
8956 op0 = XEXP (op0, 0);
8957 if (neg1)
8958 op1 = XEXP (op1, 0);
8959 }
8960
8961 if (compound_p)
8962 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8963 cost += extra_cost->fp[mode == DFmode].fma;
8964 else
8965 /* FMUL/FNMUL. */
8966 cost += extra_cost->fp[mode == DFmode].mult;
8967 }
8968
8969 cost += rtx_cost (op0, mode, MULT, 0, speed);
8970 cost += rtx_cost (op1, mode, MULT, 1, speed);
8971 return cost;
8972 }
8973 }
8974
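/* Implement TARGET_ADDRESS_COST: estimate the cost of using X as an address
   for an access of mode MODE, in the units of the per-core address cost
   tables. */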
8975 static int
8976 aarch64_address_cost (rtx x,
8977 machine_mode mode,
8978 addr_space_t as ATTRIBUTE_UNUSED,
8979 bool speed)
8980 {
8981 enum rtx_code c = GET_CODE (x);
8982 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8983 struct aarch64_address_info info;
8984 int cost = 0;
8985 info.shift = 0;
8986
8987 if (!aarch64_classify_address (&info, x, mode, false))
8988 {
8989 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8990 {
8991 /* This is a CONST or SYMBOL ref which will be split
8992 in a different way depending on the code model in use.
8993 Cost it through the generic infrastructure. */
8994 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8995 /* Divide through by the cost of one instruction to
8996 bring it to the same units as the address costs. */
8997 cost_symbol_ref /= COSTS_N_INSNS (1);
8998 /* The cost is then the cost of preparing the address,
8999 followed by an immediate (possibly 0) offset. */
9000 return cost_symbol_ref + addr_cost->imm_offset;
9001 }
9002 else
9003 {
9004 /* This is most likely a jump table from a case
9005 statement. */
9006 return addr_cost->register_offset;
9007 }
9008 }
9009
9010 switch (info.type)
9011 {
9012 case ADDRESS_LO_SUM:
9013 case ADDRESS_SYMBOLIC:
9014 case ADDRESS_REG_IMM:
9015 cost += addr_cost->imm_offset;
9016 break;
9017
9018 case ADDRESS_REG_WB:
9019 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9020 cost += addr_cost->pre_modify;
9021 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9022 cost += addr_cost->post_modify;
9023 else
9024 gcc_unreachable ();
9025
9026 break;
9027
9028 case ADDRESS_REG_REG:
9029 cost += addr_cost->register_offset;
9030 break;
9031
9032 case ADDRESS_REG_SXTW:
9033 cost += addr_cost->register_sextend;
9034 break;
9035
9036 case ADDRESS_REG_UXTW:
9037 cost += addr_cost->register_zextend;
9038 break;
9039
9040 default:
9041 gcc_unreachable ();
9042 }
9043
9044
9045 if (info.shift > 0)
9046 {
9047 /* For the sake of calculating the cost of the shifted register
9048 component, we can treat same-sized modes in the same way. */
9049 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9050 cost += addr_cost->addr_scale_costs.hi;
9051 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9052 cost += addr_cost->addr_scale_costs.si;
9053 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9054 cost += addr_cost->addr_scale_costs.di;
9055 else
9056 /* We can't tell, or this is a 128-bit vector. */
9057 cost += addr_cost->addr_scale_costs.ti;
9058 }
9059
9060 return cost;
9061 }
9062
9063 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9064 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9065 to be taken. */
9066
9067 int
9068 aarch64_branch_cost (bool speed_p, bool predictable_p)
9069 {
9070 /* When optimizing for speed, use the cost of unpredictable branches. */
9071 const struct cpu_branch_cost *branch_costs =
9072 aarch64_tune_params.branch_costs;
9073
9074 if (!speed_p || predictable_p)
9075 return branch_costs->predictable;
9076 else
9077 return branch_costs->unpredictable;
9078 }
9079
9080 /* Return true if the RTX X in mode MODE is a zero or sign extract
9081 usable in an ADD or SUB (extended register) instruction. */
9082 static bool
9083 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9084 {
9085 /* Catch add with a sign extract.
9086 This is add_<optab><mode>_multp2. */
9087 if (GET_CODE (x) == SIGN_EXTRACT
9088 || GET_CODE (x) == ZERO_EXTRACT)
9089 {
9090 rtx op0 = XEXP (x, 0);
9091 rtx op1 = XEXP (x, 1);
9092 rtx op2 = XEXP (x, 2);
9093
9094 if (GET_CODE (op0) == MULT
9095 && CONST_INT_P (op1)
9096 && op2 == const0_rtx
9097 && CONST_INT_P (XEXP (op0, 1))
9098 && aarch64_is_extend_from_extract (mode,
9099 XEXP (op0, 1),
9100 op1))
9101 {
9102 return true;
9103 }
9104 }
9105 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9106 No shift. */
9107 else if (GET_CODE (x) == SIGN_EXTEND
9108 || GET_CODE (x) == ZERO_EXTEND)
9109 return REG_P (XEXP (x, 0));
9110
9111 return false;
9112 }
9113
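/* Return true if U is the UNSPEC number of one of the FRINT floating-point
   rounding instructions. */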
9114 static bool
9115 aarch64_frint_unspec_p (unsigned int u)
9116 {
9117 switch (u)
9118 {
9119 case UNSPEC_FRINTZ:
9120 case UNSPEC_FRINTP:
9121 case UNSPEC_FRINTM:
9122 case UNSPEC_FRINTA:
9123 case UNSPEC_FRINTN:
9124 case UNSPEC_FRINTX:
9125 case UNSPEC_FRINTI:
9126 return true;
9127
9128 default:
9129 return false;
9130 }
9131 }
9132
9133 /* Return true iff X is an rtx that will match an extr instruction,
9134 i.e. as described in the *extr<mode>5_insn family of patterns.
9135 OP0 and OP1 will be set to the operands of the shifts involved
9136 on success and will be NULL_RTX otherwise. */
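/* For example, in DImode (ior (ashift X (const_int 48))
   (lshiftrt Y (const_int 16))) matches, setting *RES_OP0 to X and
   *RES_OP1 to Y, since the shift amounts sum to the mode's bitsize. */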
9137
9138 static bool
9139 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9140 {
9141 rtx op0, op1;
9142 scalar_int_mode mode;
9143 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9144 return false;
9145
9146 *res_op0 = NULL_RTX;
9147 *res_op1 = NULL_RTX;
9148
9149 if (GET_CODE (x) != IOR)
9150 return false;
9151
9152 op0 = XEXP (x, 0);
9153 op1 = XEXP (x, 1);
9154
9155 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9156 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9157 {
9158 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9159 if (GET_CODE (op1) == ASHIFT)
9160 std::swap (op0, op1);
9161
9162 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9163 return false;
9164
9165 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9166 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9167
9168 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9169 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9170 {
9171 *res_op0 = XEXP (op0, 0);
9172 *res_op1 = XEXP (op1, 0);
9173 return true;
9174 }
9175 }
9176
9177 return false;
9178 }
9179
9180 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9181 storing it in *COST. Result is true if the total cost of the operation
9182 has now been calculated. */
9183 static bool
9184 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9185 {
9186 rtx inner;
9187 rtx comparator;
9188 enum rtx_code cmpcode;
9189
9190 if (COMPARISON_P (op0))
9191 {
9192 inner = XEXP (op0, 0);
9193 comparator = XEXP (op0, 1);
9194 cmpcode = GET_CODE (op0);
9195 }
9196 else
9197 {
9198 inner = op0;
9199 comparator = const0_rtx;
9200 cmpcode = NE;
9201 }
9202
9203 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9204 {
9205 /* Conditional branch. */
9206 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9207 return true;
9208 else
9209 {
9210 if (cmpcode == NE || cmpcode == EQ)
9211 {
9212 if (comparator == const0_rtx)
9213 {
9214 /* TBZ/TBNZ/CBZ/CBNZ. */
9215 if (GET_CODE (inner) == ZERO_EXTRACT)
9216 /* TBZ/TBNZ. */
9217 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9218 ZERO_EXTRACT, 0, speed);
9219 else
9220 /* CBZ/CBNZ. */
9221 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9222
9223 return true;
9224 }
9225 }
9226 else if (cmpcode == LT || cmpcode == GE)
9227 {
9228 /* TBZ/TBNZ. */
9229 if (comparator == const0_rtx)
9230 return true;
9231 }
9232 }
9233 }
9234 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9235 {
9236 /* CCMP. */
9237 if (GET_CODE (op1) == COMPARE)
9238 {
9239 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9240 if (XEXP (op1, 1) == const0_rtx)
9241 *cost += 1;
9242 if (speed)
9243 {
9244 machine_mode mode = GET_MODE (XEXP (op1, 0));
9245 const struct cpu_cost_table *extra_cost
9246 = aarch64_tune_params.insn_extra_cost;
9247
9248 if (GET_MODE_CLASS (mode) == MODE_INT)
9249 *cost += extra_cost->alu.arith;
9250 else
9251 *cost += extra_cost->fp[mode == DFmode].compare;
9252 }
9253 return true;
9254 }
9255
9256 /* It's a conditional operation based on the status flags,
9257 so it must be some flavor of CSEL. */
9258
9259 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9260 if (GET_CODE (op1) == NEG
9261 || GET_CODE (op1) == NOT
9262 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9263 op1 = XEXP (op1, 0);
9264 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9265 {
9266 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9267 op1 = XEXP (op1, 0);
9268 op2 = XEXP (op2, 0);
9269 }
9270
9271 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9272 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9273 return true;
9274 }
9275
9276 /* We don't know what this is, cost all operands. */
9277 return false;
9278 }
9279
9280 /* Check whether X is a bitfield operation of the form shift + extend that
9281 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9282 operand to which the bitfield operation is applied. Otherwise return
9283 NULL_RTX. */
9284
9285 static rtx
9286 aarch64_extend_bitfield_pattern_p (rtx x)
9287 {
9288 rtx_code outer_code = GET_CODE (x);
9289 machine_mode outer_mode = GET_MODE (x);
9290
9291 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9292 && outer_mode != SImode && outer_mode != DImode)
9293 return NULL_RTX;
9294
9295 rtx inner = XEXP (x, 0);
9296 rtx_code inner_code = GET_CODE (inner);
9297 machine_mode inner_mode = GET_MODE (inner);
9298 rtx op = NULL_RTX;
9299
9300 switch (inner_code)
9301 {
9302 case ASHIFT:
9303 if (CONST_INT_P (XEXP (inner, 1))
9304 && (inner_mode == QImode || inner_mode == HImode))
9305 op = XEXP (inner, 0);
9306 break;
9307 case LSHIFTRT:
9308 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9309 && (inner_mode == QImode || inner_mode == HImode))
9310 op = XEXP (inner, 0);
9311 break;
9312 case ASHIFTRT:
9313 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9314 && (inner_mode == QImode || inner_mode == HImode))
9315 op = XEXP (inner, 0);
9316 break;
9317 default:
9318 break;
9319 }
9320
9321 return op;
9322 }
9323
9324 /* Return true if the mask and a shift amount from an RTX of the form
9325 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9326 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
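/* For example, in SImode a MASK of 0xff0 and a SHFT_AMNT of 4 are valid:
   (0xff0 >> 4) + 1 is 0x100, a power of two, and the low four bits of the
   mask are clear, so (x << 4) & 0xff0 is a UBFIZ of an 8-bit field at bit
   position 4. */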
9327
9328 bool
9329 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9330 rtx shft_amnt)
9331 {
9332 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9333 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9334 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9335 && (INTVAL (mask)
9336 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9337 }
9338
9339 /* Calculate the cost of calculating X, storing it in *COST. Result
9340 is true if the total cost of the operation has now been calculated. */
9341 static bool
9342 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9343 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9344 {
9345 rtx op0, op1, op2;
9346 const struct cpu_cost_table *extra_cost
9347 = aarch64_tune_params.insn_extra_cost;
9348 int code = GET_CODE (x);
9349 scalar_int_mode int_mode;
9350
9351 /* By default, assume that everything has equivalent cost to the
9352 cheapest instruction. Any additional costs are applied as a delta
9353 above this default. */
9354 *cost = COSTS_N_INSNS (1);
9355
9356 switch (code)
9357 {
9358 case SET:
9359 /* The cost depends entirely on the operands to SET. */
9360 *cost = 0;
9361 op0 = SET_DEST (x);
9362 op1 = SET_SRC (x);
9363
9364 switch (GET_CODE (op0))
9365 {
9366 case MEM:
9367 if (speed)
9368 {
9369 rtx address = XEXP (op0, 0);
9370 if (VECTOR_MODE_P (mode))
9371 *cost += extra_cost->ldst.storev;
9372 else if (GET_MODE_CLASS (mode) == MODE_INT)
9373 *cost += extra_cost->ldst.store;
9374 else if (mode == SFmode)
9375 *cost += extra_cost->ldst.storef;
9376 else if (mode == DFmode)
9377 *cost += extra_cost->ldst.stored;
9378
9379 *cost +=
9380 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9381 0, speed));
9382 }
9383
9384 *cost += rtx_cost (op1, mode, SET, 1, speed);
9385 return true;
9386
9387 case SUBREG:
9388 if (! REG_P (SUBREG_REG (op0)))
9389 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9390
9391 /* Fall through. */
9392 case REG:
9393 /* The cost is one per vector-register copied. */
9394 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9395 {
9396 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9397 *cost = COSTS_N_INSNS (nregs);
9398 }
9399 /* const0_rtx is in general free, but we will use an
9400 instruction to set a register to 0. */
9401 else if (REG_P (op1) || op1 == const0_rtx)
9402 {
9403 /* The cost is 1 per register copied. */
9404 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9405 *cost = COSTS_N_INSNS (nregs);
9406 }
9407 else
9408 /* Cost is just the cost of the RHS of the set. */
9409 *cost += rtx_cost (op1, mode, SET, 1, speed);
9410 return true;
9411
9412 case ZERO_EXTRACT:
9413 case SIGN_EXTRACT:
9414 /* Bit-field insertion. Strip any redundant widening of
9415 the RHS to meet the width of the target. */
9416 if (GET_CODE (op1) == SUBREG)
9417 op1 = SUBREG_REG (op1);
9418 if ((GET_CODE (op1) == ZERO_EXTEND
9419 || GET_CODE (op1) == SIGN_EXTEND)
9420 && CONST_INT_P (XEXP (op0, 1))
9421 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9422 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9423 op1 = XEXP (op1, 0);
9424
9425 if (CONST_INT_P (op1))
9426 {
9427 /* MOV immediate is assumed to always be cheap. */
9428 *cost = COSTS_N_INSNS (1);
9429 }
9430 else
9431 {
9432 /* BFM. */
9433 if (speed)
9434 *cost += extra_cost->alu.bfi;
9435 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9436 }
9437
9438 return true;
9439
9440 default:
9441 /* We can't make sense of this, assume default cost. */
9442 *cost = COSTS_N_INSNS (1);
9443 return false;
9444 }
9445 return false;
9446
9447 case CONST_INT:
9448 /* If an instruction can incorporate a constant directly, its
9449 expression avoids calling rtx_cost() on that constant. If
9450 rtx_cost() is called on a
9451 constant, then it is usually because the constant must be
9452 moved into a register by one or more instructions.
9453
9454 The exception is constant 0, which can be expressed
9455 as XZR/WZR and is therefore free. The one case that is not free
9456 is (set (reg) (const0_rtx)), where we must cost
9457 the move. However, we can catch that when we cost the SET, so
9458 we don't need to consider that here. */
9459 if (x == const0_rtx)
9460 *cost = 0;
9461 else
9462 {
9463 /* To an approximation, the cost of building any other constant is
9464 proportional to the number of instructions required to build
9465 that constant. This is true whether we are compiling for
9466 SPEED or otherwise. */
9467 if (!is_a <scalar_int_mode> (mode, &int_mode))
9468 int_mode = word_mode;
9469 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9470 (NULL_RTX, x, false, int_mode));
9471 }
9472 return true;
9473
9474 case CONST_DOUBLE:
9475
9476 /* First determine number of instructions to do the move
9477 as an integer constant. */
9478 if (!aarch64_float_const_representable_p (x)
9479 && !aarch64_can_const_movi_rtx_p (x, mode)
9480 && aarch64_float_const_rtx_p (x))
9481 {
9482 unsigned HOST_WIDE_INT ival;
9483 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9484 gcc_assert (succeed);
9485
9486 scalar_int_mode imode = (mode == HFmode
9487 ? SImode
9488 : int_mode_for_mode (mode).require ());
9489 int ncost = aarch64_internal_mov_immediate
9490 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9491 *cost += COSTS_N_INSNS (ncost);
9492 return true;
9493 }
9494
9495 if (speed)
9496 {
9497 /* mov[df,sf]_aarch64. */
9498 if (aarch64_float_const_representable_p (x))
9499 /* FMOV (scalar immediate). */
9500 *cost += extra_cost->fp[mode == DFmode].fpconst;
9501 else if (!aarch64_float_const_zero_rtx_p (x))
9502 {
9503 /* This will be a load from memory. */
9504 if (mode == DFmode)
9505 *cost += extra_cost->ldst.loadd;
9506 else
9507 *cost += extra_cost->ldst.loadf;
9508 }
9509 else
9510 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9511 or MOV v0.s[0], wzr - neither of which is modeled by the
9512 cost tables. Just use the default cost. */
9513 {
9514 }
9515 }
9516
9517 return true;
9518
9519 case MEM:
9520 if (speed)
9521 {
9522 /* For loads we want the base cost of a load, plus an
9523 approximation for the additional cost of the addressing
9524 mode. */
9525 rtx address = XEXP (x, 0);
9526 if (VECTOR_MODE_P (mode))
9527 *cost += extra_cost->ldst.loadv;
9528 else if (GET_MODE_CLASS (mode) == MODE_INT)
9529 *cost += extra_cost->ldst.load;
9530 else if (mode == SFmode)
9531 *cost += extra_cost->ldst.loadf;
9532 else if (mode == DFmode)
9533 *cost += extra_cost->ldst.loadd;
9534
9535 *cost +=
9536 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9537 0, speed));
9538 }
9539
9540 return true;
9541
9542 case NEG:
9543 op0 = XEXP (x, 0);
9544
9545 if (VECTOR_MODE_P (mode))
9546 {
9547 if (speed)
9548 {
9549 /* FNEG. */
9550 *cost += extra_cost->vect.alu;
9551 }
9552 return false;
9553 }
9554
9555 if (GET_MODE_CLASS (mode) == MODE_INT)
9556 {
9557 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9558 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9559 {
9560 /* CSETM. */
9561 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9562 return true;
9563 }
9564
9565 /* Cost this as SUB wzr, X. */
9566 op0 = CONST0_RTX (mode);
9567 op1 = XEXP (x, 0);
9568 goto cost_minus;
9569 }
9570
9571 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9572 {
9573 /* Support (neg(fma...)) as a single instruction only if
9574 sign of zeros is unimportant. This matches the decision
9575 making in aarch64.md. */
9576 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9577 {
9578 /* FNMADD. */
9579 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9580 return true;
9581 }
9582 if (GET_CODE (op0) == MULT)
9583 {
9584 /* FNMUL. */
9585 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9586 return true;
9587 }
9588 if (speed)
9589 /* FNEG. */
9590 *cost += extra_cost->fp[mode == DFmode].neg;
9591 return false;
9592 }
9593
9594 return false;
9595
9596 case CLRSB:
9597 case CLZ:
9598 if (speed)
9599 {
9600 if (VECTOR_MODE_P (mode))
9601 *cost += extra_cost->vect.alu;
9602 else
9603 *cost += extra_cost->alu.clz;
9604 }
9605
9606 return false;
9607
9608 case COMPARE:
9609 op0 = XEXP (x, 0);
9610 op1 = XEXP (x, 1);
9611
9612 if (op1 == const0_rtx
9613 && GET_CODE (op0) == AND)
9614 {
9615 x = op0;
9616 mode = GET_MODE (op0);
9617 goto cost_logic;
9618 }
9619
9620 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9621 {
9622 /* TODO: A write to the CC flags possibly costs extra, this
9623 needs encoding in the cost tables. */
9624
9625 mode = GET_MODE (op0);
9626 /* ANDS. */
9627 if (GET_CODE (op0) == AND)
9628 {
9629 x = op0;
9630 goto cost_logic;
9631 }
9632
9633 if (GET_CODE (op0) == PLUS)
9634 {
9635 /* ADDS (and CMN alias). */
9636 x = op0;
9637 goto cost_plus;
9638 }
9639
9640 if (GET_CODE (op0) == MINUS)
9641 {
9642 /* SUBS. */
9643 x = op0;
9644 goto cost_minus;
9645 }
9646
9647 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9648 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9649 && CONST_INT_P (XEXP (op0, 2)))
9650 {
9651 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9652 Handle it here directly rather than going to cost_logic
9653 since we know the immediate generated for the TST is valid,
9654 so we can avoid creating an intermediate rtx for it only
9655 for costing purposes. */
9656 if (speed)
9657 *cost += extra_cost->alu.logical;
9658
9659 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9660 ZERO_EXTRACT, 0, speed);
9661 return true;
9662 }
9663
9664 if (GET_CODE (op1) == NEG)
9665 {
9666 /* CMN. */
9667 if (speed)
9668 *cost += extra_cost->alu.arith;
9669
9670 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9671 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9672 return true;
9673 }
9674
9675 /* CMP.
9676
9677 Compare can freely swap the order of operands, and
9678 canonicalization puts the more complex operation first.
9679 But the integer MINUS logic expects the shift/extend
9680 operation in op1. */
9681 if (! (REG_P (op0)
9682 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9683 {
9684 op0 = XEXP (x, 1);
9685 op1 = XEXP (x, 0);
9686 }
9687 goto cost_minus;
9688 }
9689
9690 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9691 {
9692 /* FCMP. */
9693 if (speed)
9694 *cost += extra_cost->fp[mode == DFmode].compare;
9695
9696 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9697 {
9698 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9699 /* FCMP supports constant 0.0 for no extra cost. */
9700 return true;
9701 }
9702 return false;
9703 }
9704
9705 if (VECTOR_MODE_P (mode))
9706 {
9707 /* Vector compare. */
9708 if (speed)
9709 *cost += extra_cost->vect.alu;
9710
9711 if (aarch64_float_const_zero_rtx_p (op1))
9712 {
9713 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9714 cost. */
9715 return true;
9716 }
9717 return false;
9718 }
9719 return false;
9720
9721 case MINUS:
9722 {
9723 op0 = XEXP (x, 0);
9724 op1 = XEXP (x, 1);
9725
9726 cost_minus:
9727 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9728
9729 /* Detect valid immediates. */
9730 if ((GET_MODE_CLASS (mode) == MODE_INT
9731 || (GET_MODE_CLASS (mode) == MODE_CC
9732 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9733 && CONST_INT_P (op1)
9734 && aarch64_uimm12_shift (INTVAL (op1)))
9735 {
9736 if (speed)
9737 /* SUB(S) (immediate). */
9738 *cost += extra_cost->alu.arith;
9739 return true;
9740 }
9741
9742 /* Look for SUB (extended register). */
9743 if (is_a <scalar_int_mode> (mode, &int_mode)
9744 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9745 {
9746 if (speed)
9747 *cost += extra_cost->alu.extend_arith;
9748
9749 op1 = aarch64_strip_extend (op1, true);
9750 *cost += rtx_cost (op1, VOIDmode,
9751 (enum rtx_code) GET_CODE (op1), 0, speed);
9752 return true;
9753 }
9754
9755 rtx new_op1 = aarch64_strip_extend (op1, false);
9756
9757 /* Cost this as an FMA-alike operation. */
9758 if ((GET_CODE (new_op1) == MULT
9759 || aarch64_shift_p (GET_CODE (new_op1)))
9760 && code != COMPARE)
9761 {
9762 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9763 (enum rtx_code) code,
9764 speed);
9765 return true;
9766 }
9767
9768 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9769
9770 if (speed)
9771 {
9772 if (VECTOR_MODE_P (mode))
9773 {
9774 /* Vector SUB. */
9775 *cost += extra_cost->vect.alu;
9776 }
9777 else if (GET_MODE_CLASS (mode) == MODE_INT)
9778 {
9779 /* SUB(S). */
9780 *cost += extra_cost->alu.arith;
9781 }
9782 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9783 {
9784 /* FSUB. */
9785 *cost += extra_cost->fp[mode == DFmode].addsub;
9786 }
9787 }
9788 return true;
9789 }
9790
9791 case PLUS:
9792 {
9793 rtx new_op0;
9794
9795 op0 = XEXP (x, 0);
9796 op1 = XEXP (x, 1);
9797
9798 cost_plus:
9799 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9800 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9801 {
9802 /* CSINC. */
9803 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9804 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9805 return true;
9806 }
9807
9808 if (GET_MODE_CLASS (mode) == MODE_INT
9809 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9810 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9811 {
9812 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9813
9814 if (speed)
9815 /* ADD (immediate). */
9816 *cost += extra_cost->alu.arith;
9817 return true;
9818 }
9819
9820 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9821
9822 /* Look for ADD (extended register). */
9823 if (is_a <scalar_int_mode> (mode, &int_mode)
9824 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9825 {
9826 if (speed)
9827 *cost += extra_cost->alu.extend_arith;
9828
9829 op0 = aarch64_strip_extend (op0, true);
9830 *cost += rtx_cost (op0, VOIDmode,
9831 (enum rtx_code) GET_CODE (op0), 0, speed);
9832 return true;
9833 }
9834
9835 /* Strip any extend, leave shifts behind as we will
9836 cost them through mult_cost. */
9837 new_op0 = aarch64_strip_extend (op0, false);
9838
9839 if (GET_CODE (new_op0) == MULT
9840 || aarch64_shift_p (GET_CODE (new_op0)))
9841 {
9842 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9843 speed);
9844 return true;
9845 }
9846
9847 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9848
9849 if (speed)
9850 {
9851 if (VECTOR_MODE_P (mode))
9852 {
9853 /* Vector ADD. */
9854 *cost += extra_cost->vect.alu;
9855 }
9856 else if (GET_MODE_CLASS (mode) == MODE_INT)
9857 {
9858 /* ADD. */
9859 *cost += extra_cost->alu.arith;
9860 }
9861 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9862 {
9863 /* FADD. */
9864 *cost += extra_cost->fp[mode == DFmode].addsub;
9865 }
9866 }
9867 return true;
9868 }
9869
9870 case BSWAP:
9871 *cost = COSTS_N_INSNS (1);
9872
9873 if (speed)
9874 {
9875 if (VECTOR_MODE_P (mode))
9876 *cost += extra_cost->vect.alu;
9877 else
9878 *cost += extra_cost->alu.rev;
9879 }
9880 return false;
9881
9882 case IOR:
9883 if (aarch_rev16_p (x))
9884 {
9885 *cost = COSTS_N_INSNS (1);
9886
9887 if (speed)
9888 {
9889 if (VECTOR_MODE_P (mode))
9890 *cost += extra_cost->vect.alu;
9891 else
9892 *cost += extra_cost->alu.rev;
9893 }
9894 return true;
9895 }
9896
9897 if (aarch64_extr_rtx_p (x, &op0, &op1))
9898 {
9899 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9900 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9901 if (speed)
9902 *cost += extra_cost->alu.shift;
9903
9904 return true;
9905 }
9906 /* Fall through. */
9907 case XOR:
9908 case AND:
9909 cost_logic:
9910 op0 = XEXP (x, 0);
9911 op1 = XEXP (x, 1);
9912
9913 if (VECTOR_MODE_P (mode))
9914 {
9915 if (speed)
9916 *cost += extra_cost->vect.alu;
9917 return true;
9918 }
9919
9920 if (code == AND
9921 && GET_CODE (op0) == MULT
9922 && CONST_INT_P (XEXP (op0, 1))
9923 && CONST_INT_P (op1)
9924 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9925 INTVAL (op1)) != 0)
9926 {
9927 /* This is a UBFM/SBFM. */
9928 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9929 if (speed)
9930 *cost += extra_cost->alu.bfx;
9931 return true;
9932 }
9933
9934 if (is_int_mode (mode, &int_mode))
9935 {
9936 if (CONST_INT_P (op1))
9937 {
9938 /* We have a mask + shift version of a UBFIZ
9939 i.e. the *andim_ashift<mode>_bfiz pattern. */
9940 if (GET_CODE (op0) == ASHIFT
9941 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9942 XEXP (op0, 1)))
9943 {
9944 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9945 (enum rtx_code) code, 0, speed);
9946 if (speed)
9947 *cost += extra_cost->alu.bfx;
9948
9949 return true;
9950 }
9951 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9952 {
9953 /* We possibly get the immediate for free; this is not
9954 modelled. */
9955 *cost += rtx_cost (op0, int_mode,
9956 (enum rtx_code) code, 0, speed);
9957 if (speed)
9958 *cost += extra_cost->alu.logical;
9959
9960 return true;
9961 }
9962 }
9963 else
9964 {
9965 rtx new_op0 = op0;
9966
9967 /* Handle ORN, EON, or BIC. */
9968 if (GET_CODE (op0) == NOT)
9969 op0 = XEXP (op0, 0);
9970
9971 new_op0 = aarch64_strip_shift (op0);
9972
9973 /* If we had a shift on op0 then this is a logical-shift-
9974 by-register/immediate operation. Otherwise, this is just
9975 a logical operation. */
9976 if (speed)
9977 {
9978 if (new_op0 != op0)
9979 {
9980 /* Shift by immediate. */
9981 if (CONST_INT_P (XEXP (op0, 1)))
9982 *cost += extra_cost->alu.log_shift;
9983 else
9984 *cost += extra_cost->alu.log_shift_reg;
9985 }
9986 else
9987 *cost += extra_cost->alu.logical;
9988 }
9989
9990 /* In both cases we want to cost both operands. */
9991 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9992 0, speed);
9993 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9994 1, speed);
9995
9996 return true;
9997 }
9998 }
9999 return false;
10000
10001 case NOT:
10002 x = XEXP (x, 0);
10003 op0 = aarch64_strip_shift (x);
10004
10005 if (VECTOR_MODE_P (mode))
10006 {
10007 /* Vector NOT. */
10008 *cost += extra_cost->vect.alu;
10009 return false;
10010 }
10011
10012 /* MVN-shifted-reg. */
10013 if (op0 != x)
10014 {
10015 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10016
10017 if (speed)
10018 *cost += extra_cost->alu.log_shift;
10019
10020 return true;
10021 }
10022 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10023 Handle the second form here, taking care that 'a' in the above can
10024 be a shift. */
10025 else if (GET_CODE (op0) == XOR)
10026 {
10027 rtx newop0 = XEXP (op0, 0);
10028 rtx newop1 = XEXP (op0, 1);
10029 rtx op0_stripped = aarch64_strip_shift (newop0);
10030
10031 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10032 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10033
10034 if (speed)
10035 {
10036 if (op0_stripped != newop0)
10037 *cost += extra_cost->alu.log_shift;
10038 else
10039 *cost += extra_cost->alu.logical;
10040 }
10041
10042 return true;
10043 }
10044 /* MVN. */
10045 if (speed)
10046 *cost += extra_cost->alu.logical;
10047
10048 return false;
10049
10050 case ZERO_EXTEND:
10051
10052 op0 = XEXP (x, 0);
10053 /* If a value is written in SI mode and then zero-extended to DI
10054 mode, the operation will in general be free, as a write to
10055 a 'w' register implicitly zeroes the upper bits of an 'x'
10056 register. However, if this is
10057
10058 (set (reg) (zero_extend (reg)))
10059
10060 we must cost the explicit register move. */
10061 if (mode == DImode
10062 && GET_MODE (op0) == SImode
10063 && outer == SET)
10064 {
10065 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10066
10067 /* If OP_COST is non-zero, then the cost of the zero extend
10068 is effectively the cost of the inner operation. Otherwise
10069 we have a MOV instruction and we take the cost from the MOV
10070 itself. This is true independently of whether we are
10071 optimizing for space or time. */
10072 if (op_cost)
10073 *cost = op_cost;
10074
10075 return true;
10076 }
10077 else if (MEM_P (op0))
10078 {
10079 /* All loads can zero extend to any size for free. */
10080 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10081 return true;
10082 }
10083
10084 op0 = aarch64_extend_bitfield_pattern_p (x);
10085 if (op0)
10086 {
10087 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10088 if (speed)
10089 *cost += extra_cost->alu.bfx;
10090 return true;
10091 }
10092
10093 if (speed)
10094 {
10095 if (VECTOR_MODE_P (mode))
10096 {
10097 /* UMOV. */
10098 *cost += extra_cost->vect.alu;
10099 }
10100 else
10101 {
10102 /* We generate an AND instead of UXTB/UXTH. */
10103 *cost += extra_cost->alu.logical;
10104 }
10105 }
10106 return false;
10107
10108 case SIGN_EXTEND:
10109 if (MEM_P (XEXP (x, 0)))
10110 {
10111 /* LDRSH. */
10112 if (speed)
10113 {
10114 rtx address = XEXP (XEXP (x, 0), 0);
10115 *cost += extra_cost->ldst.load_sign_extend;
10116
10117 *cost +=
10118 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10119 0, speed));
10120 }
10121 return true;
10122 }
10123
10124 op0 = aarch64_extend_bitfield_pattern_p (x);
10125 if (op0)
10126 {
10127 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10128 if (speed)
10129 *cost += extra_cost->alu.bfx;
10130 return true;
10131 }
10132
10133 if (speed)
10134 {
10135 if (VECTOR_MODE_P (mode))
10136 *cost += extra_cost->vect.alu;
10137 else
10138 *cost += extra_cost->alu.extend;
10139 }
10140 return false;
10141
10142 case ASHIFT:
10143 op0 = XEXP (x, 0);
10144 op1 = XEXP (x, 1);
10145
10146 if (CONST_INT_P (op1))
10147 {
10148 if (speed)
10149 {
10150 if (VECTOR_MODE_P (mode))
10151 {
10152 /* Vector shift (immediate). */
10153 *cost += extra_cost->vect.alu;
10154 }
10155 else
10156 {
10157 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10158 aliases. */
10159 *cost += extra_cost->alu.shift;
10160 }
10161 }
10162
10163 /* We can incorporate zero/sign extend for free. */
10164 if (GET_CODE (op0) == ZERO_EXTEND
10165 || GET_CODE (op0) == SIGN_EXTEND)
10166 op0 = XEXP (op0, 0);
10167
10168 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10169 return true;
10170 }
10171 else
10172 {
10173 if (VECTOR_MODE_P (mode))
10174 {
10175 if (speed)
10176 /* Vector shift (register). */
10177 *cost += extra_cost->vect.alu;
10178 }
10179 else
10180 {
10181 if (speed)
10182 /* LSLV. */
10183 *cost += extra_cost->alu.shift_reg;
10184
10185 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10186 && CONST_INT_P (XEXP (op1, 1))
10187 && known_eq (INTVAL (XEXP (op1, 1)),
10188 GET_MODE_BITSIZE (mode) - 1))
10189 {
10190 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10191 /* We already demanded XEXP (op1, 0) to be REG_P, so
10192 don't recurse into it. */
10193 return true;
10194 }
10195 }
10196 return false; /* All arguments need to be in registers. */
10197 }
10198
10199 case ROTATE:
10200 case ROTATERT:
10201 case LSHIFTRT:
10202 case ASHIFTRT:
10203 op0 = XEXP (x, 0);
10204 op1 = XEXP (x, 1);
10205
10206 if (CONST_INT_P (op1))
10207 {
10208 /* ASR (immediate) and friends. */
10209 if (speed)
10210 {
10211 if (VECTOR_MODE_P (mode))
10212 *cost += extra_cost->vect.alu;
10213 else
10214 *cost += extra_cost->alu.shift;
10215 }
10216
10217 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10218 return true;
10219 }
10220 else
10221 {
10222 if (VECTOR_MODE_P (mode))
10223 {
10224 if (speed)
10225 /* Vector shift (register). */
10226 *cost += extra_cost->vect.alu;
10227 }
10228 else
10229 {
10230 if (speed)
10231 /* ASR (register) and friends. */
10232 *cost += extra_cost->alu.shift_reg;
10233
10234 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10235 && CONST_INT_P (XEXP (op1, 1))
10236 && known_eq (INTVAL (XEXP (op1, 1)),
10237 GET_MODE_BITSIZE (mode) - 1))
10238 {
10239 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10240 /* We already demanded XEXP (op1, 0) to be REG_P, so
10241 don't recurse into it. */
10242 return true;
10243 }
10244 }
10245 return false; /* All arguments need to be in registers. */
10246 }
10247
10248 case SYMBOL_REF:
10249
10250 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10251 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10252 {
10253 /* LDR. */
10254 if (speed)
10255 *cost += extra_cost->ldst.load;
10256 }
10257 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10258 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10259 {
10260 /* ADRP, followed by ADD. */
10261 *cost += COSTS_N_INSNS (1);
10262 if (speed)
10263 *cost += 2 * extra_cost->alu.arith;
10264 }
10265 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10266 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10267 {
10268 /* ADR. */
10269 if (speed)
10270 *cost += extra_cost->alu.arith;
10271 }
10272
10273 if (flag_pic)
10274 {
10275 /* One extra load instruction, after accessing the GOT. */
10276 *cost += COSTS_N_INSNS (1);
10277 if (speed)
10278 *cost += extra_cost->ldst.load;
10279 }
10280 return true;
10281
10282 case HIGH:
10283 case LO_SUM:
10284 /* ADRP/ADD (immediate). */
10285 if (speed)
10286 *cost += extra_cost->alu.arith;
10287 return true;
10288
10289 case ZERO_EXTRACT:
10290 case SIGN_EXTRACT:
10291 /* UBFX/SBFX. */
10292 if (speed)
10293 {
10294 if (VECTOR_MODE_P (mode))
10295 *cost += extra_cost->vect.alu;
10296 else
10297 *cost += extra_cost->alu.bfx;
10298 }
10299
10300 /* We can trust that the immediates used will be correct (there
10301 are no by-register forms), so we need only cost op0. */
10302 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10303 return true;
10304
10305 case MULT:
10306 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10307 /* aarch64_rtx_mult_cost always handles recursion to its
10308 operands. */
10309 return true;
10310
10311 case MOD:
10312 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10313 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
10314 an unconditional negate. This case should only ever be reached through
10315 the set_smod_pow2_cheap check in expmed.c. */
10316 if (CONST_INT_P (XEXP (x, 1))
10317 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10318 && (mode == SImode || mode == DImode))
10319 {
10320 /* We expand to 4 instructions. Reset the baseline. */
10321 *cost = COSTS_N_INSNS (4);
10322
10323 if (speed)
10324 *cost += 2 * extra_cost->alu.logical
10325 + 2 * extra_cost->alu.arith;
10326
10327 return true;
10328 }
10329
10330 /* Fall through. */
10331 case UMOD:
10332 if (speed)
10333 {
10334 /* Slightly prefer UMOD over SMOD. */
10335 if (VECTOR_MODE_P (mode))
10336 *cost += extra_cost->vect.alu;
10337 else if (GET_MODE_CLASS (mode) == MODE_INT)
10338 *cost += (extra_cost->mult[mode == DImode].add
10339 + extra_cost->mult[mode == DImode].idiv
10340 + (code == MOD ? 1 : 0));
10341 }
10342 return false; /* All arguments need to be in registers. */
10343
10344 case DIV:
10345 case UDIV:
10346 case SQRT:
10347 if (speed)
10348 {
10349 if (VECTOR_MODE_P (mode))
10350 *cost += extra_cost->vect.alu;
10351 else if (GET_MODE_CLASS (mode) == MODE_INT)
10352 /* There is no integer SQRT, so only DIV and UDIV can get
10353 here. */
10354 *cost += (extra_cost->mult[mode == DImode].idiv
10355 /* Slightly prefer UDIV over SDIV. */
10356 + (code == DIV ? 1 : 0));
10357 else
10358 *cost += extra_cost->fp[mode == DFmode].div;
10359 }
10360 return false; /* All arguments need to be in registers. */
10361
10362 case IF_THEN_ELSE:
10363 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10364 XEXP (x, 2), cost, speed);
10365
10366 case EQ:
10367 case NE:
10368 case GT:
10369 case GTU:
10370 case LT:
10371 case LTU:
10372 case GE:
10373 case GEU:
10374 case LE:
10375 case LEU:
10376
10377 return false; /* All arguments must be in registers. */
10378
10379 case FMA:
10380 op0 = XEXP (x, 0);
10381 op1 = XEXP (x, 1);
10382 op2 = XEXP (x, 2);
10383
10384 if (speed)
10385 {
10386 if (VECTOR_MODE_P (mode))
10387 *cost += extra_cost->vect.alu;
10388 else
10389 *cost += extra_cost->fp[mode == DFmode].fma;
10390 }
10391
10392 /* FMSUB, FNMADD, and FNMSUB are free. */
10393 if (GET_CODE (op0) == NEG)
10394 op0 = XEXP (op0, 0);
10395
10396 if (GET_CODE (op2) == NEG)
10397 op2 = XEXP (op2, 0);
10398
10399 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10400 and the by-element operand as operand 0. */
10401 if (GET_CODE (op1) == NEG)
10402 op1 = XEXP (op1, 0);
10403
10404 /* Catch vector-by-element operations. The by-element operand can
10405 either be (vec_duplicate (vec_select (x))) or just
10406 (vec_select (x)), depending on whether we are multiplying by
10407 a vector or a scalar.
10408
10409 Canonicalization is not very good in these cases: FMA4 will put the
10410 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10411 if (GET_CODE (op0) == VEC_DUPLICATE)
10412 op0 = XEXP (op0, 0);
10413 else if (GET_CODE (op1) == VEC_DUPLICATE)
10414 op1 = XEXP (op1, 0);
10415
10416 if (GET_CODE (op0) == VEC_SELECT)
10417 op0 = XEXP (op0, 0);
10418 else if (GET_CODE (op1) == VEC_SELECT)
10419 op1 = XEXP (op1, 0);
10420
10421 /* If the remaining parameters are not registers,
10422 get the cost to put them into registers. */
10423 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10424 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10425 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10426 return true;
10427
10428 case FLOAT:
10429 case UNSIGNED_FLOAT:
10430 if (speed)
10431 *cost += extra_cost->fp[mode == DFmode].fromint;
10432 return false;
10433
10434 case FLOAT_EXTEND:
10435 if (speed)
10436 {
10437 if (VECTOR_MODE_P (mode))
10438 {
10439 /* Vector widen. */
10440 *cost += extra_cost->vect.alu;
10441 }
10442 else
10443 *cost += extra_cost->fp[mode == DFmode].widen;
10444 }
10445 return false;
10446
10447 case FLOAT_TRUNCATE:
10448 if (speed)
10449 {
10450 if (VECTOR_MODE_P (mode))
10451 {
10452 /* Vector narrow. */
10453 *cost += extra_cost->vect.alu;
10454 }
10455 else
10456 *cost += extra_cost->fp[mode == DFmode].narrow;
10457 }
10458 return false;
10459
10460 case FIX:
10461 case UNSIGNED_FIX:
10462 x = XEXP (x, 0);
10463 /* Strip the rounding part. They will all be implemented
10464 by the fcvt* family of instructions anyway. */
10465 if (GET_CODE (x) == UNSPEC)
10466 {
10467 unsigned int uns_code = XINT (x, 1);
10468
10469 if (uns_code == UNSPEC_FRINTA
10470 || uns_code == UNSPEC_FRINTM
10471 || uns_code == UNSPEC_FRINTN
10472 || uns_code == UNSPEC_FRINTP
10473 || uns_code == UNSPEC_FRINTZ)
10474 x = XVECEXP (x, 0, 0);
10475 }
10476
10477 if (speed)
10478 {
10479 if (VECTOR_MODE_P (mode))
10480 *cost += extra_cost->vect.alu;
10481 else
10482 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10483 }
10484
10485 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10486 fixed-point fcvt. */
10487 if (GET_CODE (x) == MULT
10488 && ((VECTOR_MODE_P (mode)
10489 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10490 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10491 {
10492 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10493 0, speed);
10494 return true;
10495 }
10496
10497 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10498 return true;
10499
10500 case ABS:
10501 if (VECTOR_MODE_P (mode))
10502 {
10503 /* ABS (vector). */
10504 if (speed)
10505 *cost += extra_cost->vect.alu;
10506 }
10507 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10508 {
10509 op0 = XEXP (x, 0);
10510
10511 /* FABD, which is analogous to FADD. */
10512 if (GET_CODE (op0) == MINUS)
10513 {
10514 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10515 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10516 if (speed)
10517 *cost += extra_cost->fp[mode == DFmode].addsub;
10518
10519 return true;
10520 }
10521 /* Simple FABS is analogous to FNEG. */
10522 if (speed)
10523 *cost += extra_cost->fp[mode == DFmode].neg;
10524 }
10525 else
10526 {
10527 /* Integer ABS will either be split into
10528 two arithmetic instructions, or will be an ABS
10529 (scalar), which we don't model. */
10530 *cost = COSTS_N_INSNS (2);
10531 if (speed)
10532 *cost += 2 * extra_cost->alu.arith;
10533 }
10534 return false;
10535
10536 case SMAX:
10537 case SMIN:
10538 if (speed)
10539 {
10540 if (VECTOR_MODE_P (mode))
10541 *cost += extra_cost->vect.alu;
10542 else
10543 {
10544 /* FMAXNM/FMINNM/FMAX/FMIN.
10545 TODO: This may not be accurate for all implementations, but
10546 we do not model this in the cost tables. */
10547 *cost += extra_cost->fp[mode == DFmode].addsub;
10548 }
10549 }
10550 return false;
10551
10552 case UNSPEC:
10553 /* The floating point round to integer frint* instructions. */
10554 if (aarch64_frint_unspec_p (XINT (x, 1)))
10555 {
10556 if (speed)
10557 *cost += extra_cost->fp[mode == DFmode].roundint;
10558
10559 return false;
10560 }
10561
10562 if (XINT (x, 1) == UNSPEC_RBIT)
10563 {
10564 if (speed)
10565 *cost += extra_cost->alu.rev;
10566
10567 return false;
10568 }
10569 break;
10570
10571 case TRUNCATE:
10572
10573 /* Decompose <su>muldi3_highpart. */
10574 if (/* (truncate:DI */
10575 mode == DImode
10576 /* (lshiftrt:TI */
10577 && GET_MODE (XEXP (x, 0)) == TImode
10578 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10579 /* (mult:TI */
10580 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10581 /* (ANY_EXTEND:TI (reg:DI))
10582 (ANY_EXTEND:TI (reg:DI))) */
10583 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10584 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10585 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10586 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10587 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10588 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10589 /* (const_int 64) */
10590 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10591 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10592 {
10593 /* UMULH/SMULH. */
10594 if (speed)
10595 *cost += extra_cost->mult[mode == DImode].extend;
10596 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10597 mode, MULT, 0, speed);
10598 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10599 mode, MULT, 1, speed);
10600 return true;
10601 }
10602
10603 /* Fall through. */
10604 default:
10605 break;
10606 }
10607
10608 if (dump_file
10609 && flag_aarch64_verbose_cost)
10610 fprintf (dump_file,
10611 "\nFailed to cost RTX. Assuming default cost.\n");
10612
10613 return true;
10614 }
10615
10616 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10617 calculated for X. This cost is stored in *COST. Returns true
10618 if the total cost of X was calculated. */
10619 static bool
10620 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10621 int param, int *cost, bool speed)
10622 {
10623 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10624
10625 if (dump_file
10626 && flag_aarch64_verbose_cost)
10627 {
10628 print_rtl_single (dump_file, x);
10629 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10630 speed ? "Hot" : "Cold",
10631 *cost, result ? "final" : "partial");
10632 }
10633
10634 return result;
10635 }
10636
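/* Implement TARGET_REGISTER_MOVE_COST: return the cost of moving a value of
   mode MODE from register class FROM_I to register class TO_I, based on the
   tuned register-move cost table. */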
10637 static int
10638 aarch64_register_move_cost (machine_mode mode,
10639 reg_class_t from_i, reg_class_t to_i)
10640 {
10641 enum reg_class from = (enum reg_class) from_i;
10642 enum reg_class to = (enum reg_class) to_i;
10643 const struct cpu_regmove_cost *regmove_cost
10644 = aarch64_tune_params.regmove_cost;
10645
10646 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10647 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10648 to = GENERAL_REGS;
10649
10650 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10651 from = GENERAL_REGS;
10652
10653 /* Moving between a GPR and the stack register costs the same as GP2GP. */
10654 if ((from == GENERAL_REGS && to == STACK_REG)
10655 || (to == GENERAL_REGS && from == STACK_REG))
10656 return regmove_cost->GP2GP;
10657
10658 /* To/from the stack register, we move via the GPRs. */
10659 if (to == STACK_REG || from == STACK_REG)
10660 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10661 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10662
10663 if (known_eq (GET_MODE_SIZE (mode), 16))
10664 {
10665 /* 128-bit operations on general registers require 2 instructions. */
10666 if (from == GENERAL_REGS && to == GENERAL_REGS)
10667 return regmove_cost->GP2GP * 2;
10668 else if (from == GENERAL_REGS)
10669 return regmove_cost->GP2FP * 2;
10670 else if (to == GENERAL_REGS)
10671 return regmove_cost->FP2GP * 2;
10672
10673 /* When AdvSIMD instructions are disabled it is not possible to move
10674 a 128-bit value directly between Q registers. This is handled in
10675 secondary reload. A general register is used as a scratch to move
10676 the upper DI value and the lower DI value is moved directly,
10677 hence the cost is the sum of three moves. */
10678 if (! TARGET_SIMD)
10679 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10680
10681 return regmove_cost->FP2FP;
10682 }
10683
10684 if (from == GENERAL_REGS && to == GENERAL_REGS)
10685 return regmove_cost->GP2GP;
10686 else if (from == GENERAL_REGS)
10687 return regmove_cost->GP2FP;
10688 else if (to == GENERAL_REGS)
10689 return regmove_cost->FP2GP;
10690
10691 return regmove_cost->FP2FP;
10692 }
10693
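/* Implement TARGET_MEMORY_MOVE_COST. The current tuning model uses a single
   memmov_cost value for all modes and register classes. */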
10694 static int
10695 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10696 reg_class_t rclass ATTRIBUTE_UNUSED,
10697 bool in ATTRIBUTE_UNUSED)
10698 {
10699 return aarch64_tune_params.memmov_cost;
10700 }
10701
10702 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10703 to optimize 1.0/sqrt. */
10704
10705 static bool
10706 use_rsqrt_p (machine_mode mode)
10707 {
10708 return (!flag_trapping_math
10709 && flag_unsafe_math_optimizations
10710 && ((aarch64_tune_params.approx_modes->recip_sqrt
10711 & AARCH64_APPROX_MODE (mode))
10712 || flag_mrecip_low_precision_sqrt));
10713 }
10714
10715 /* Function to decide when to use the approximate reciprocal square root
10716 builtin. */
10717
10718 static tree
10719 aarch64_builtin_reciprocal (tree fndecl)
10720 {
10721 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10722
10723 if (!use_rsqrt_p (mode))
10724 return NULL_TREE;
10725 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10726 }
10727
10728 /* Emit instruction sequence to compute either the approximate square root
10729 or its approximate reciprocal, depending on the flag RECP, and return
10730 whether the sequence was emitted or not. */
10731
10732 bool
10733 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10734 {
10735 machine_mode mode = GET_MODE (dst);
10736
10737 if (GET_MODE_INNER (mode) == HFmode)
10738 {
10739 gcc_assert (!recp);
10740 return false;
10741 }
10742
10743 if (!recp)
10744 {
10745 if (!(flag_mlow_precision_sqrt
10746 || (aarch64_tune_params.approx_modes->sqrt
10747 & AARCH64_APPROX_MODE (mode))))
10748 return false;
10749
10750 if (flag_finite_math_only
10751 || flag_trapping_math
10752 || !flag_unsafe_math_optimizations
10753 || optimize_function_for_size_p (cfun))
10754 return false;
10755 }
10756 else
10757 /* Caller assumes we cannot fail. */
10758 gcc_assert (use_rsqrt_p (mode));
10759
10760 machine_mode mmsk = mode_for_int_vector (mode).require ();
10761 rtx xmsk = gen_reg_rtx (mmsk);
10762 if (!recp)
10763 /* When calculating the approximate square root, compare the
10764 argument with 0.0 and create a mask. */
10765 emit_insn (gen_rtx_SET (xmsk,
10766 gen_rtx_NEG (mmsk,
10767 gen_rtx_EQ (mmsk, src,
10768 CONST0_RTX (mode)))));
10769
10770 /* Estimate the approximate reciprocal square root. */
10771 rtx xdst = gen_reg_rtx (mode);
10772 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10773
10774 /* Iterate over the series twice for SF and thrice for DF. */
10775 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10776
10777 /* Optionally iterate over the series once less for faster performance
10778 while sacrificing some accuracy. */
10779 if ((recp && flag_mrecip_low_precision_sqrt)
10780 || (!recp && flag_mlow_precision_sqrt))
10781 iterations--;
10782
10783 /* Iterate over the series to calculate the approximate reciprocal square
10784 root. */
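/* Each step computes xdst * FRSQRTS (src, xdst * xdst); FRSQRTS (a, b)
   evaluates (3 - a * b) / 2, so this is a Newton-Raphson refinement of
   1/sqrt (src). */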
10785 rtx x1 = gen_reg_rtx (mode);
10786 while (iterations--)
10787 {
10788 rtx x2 = gen_reg_rtx (mode);
10789 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10790
10791 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10792
10793 if (iterations > 0)
10794 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10795 }
10796
10797 if (!recp)
10798 {
10799 /* Qualify the approximate reciprocal square root when the argument is
10800 0.0 by squashing the intermediate result to 0.0. */
10801 rtx xtmp = gen_reg_rtx (mmsk);
10802 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10803 gen_rtx_SUBREG (mmsk, xdst, 0)));
10804 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10805
10806 /* Calculate the approximate square root. */
10807 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10808 }
10809
10810 /* Finalize the approximation. */
10811 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10812
10813 return true;
10814 }
10815
10816 /* Emit the instruction sequence to compute the approximation for the division
10817 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10818
10819 bool
10820 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10821 {
10822 machine_mode mode = GET_MODE (quo);
10823
10824 if (GET_MODE_INNER (mode) == HFmode)
10825 return false;
10826
10827 bool use_approx_division_p = (flag_mlow_precision_div
10828 || (aarch64_tune_params.approx_modes->division
10829 & AARCH64_APPROX_MODE (mode)));
10830
10831 if (!flag_finite_math_only
10832 || flag_trapping_math
10833 || !flag_unsafe_math_optimizations
10834 || optimize_function_for_size_p (cfun)
10835 || !use_approx_division_p)
10836 return false;
10837
10838 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10839 return false;
10840
10841 /* Estimate the approximate reciprocal. */
10842 rtx xrcp = gen_reg_rtx (mode);
10843 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10844
10845 /* Iterate over the series twice for SF and thrice for DF. */
10846 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10847
10848 /* Optionally iterate over the series once less for faster performance,
10849 at the cost of some accuracy. */
10850 if (flag_mlow_precision_div)
10851 iterations--;
10852
10853 /* Iterate over the series to calculate the approximate reciprocal. */
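/* Each FRECPS step below is one Newton-Raphson refinement of the FRECPE
   estimate: with X approximating 1/DEN, FRECPS computes 2 - DEN * X and
   X * (2 - DEN * X) is the refined reciprocal.  As with the square root
   above, each step roughly doubles the number of accurate bits (an
   informal sketch, not a precision guarantee).  */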
10854 rtx xtmp = gen_reg_rtx (mode);
10855 while (iterations--)
10856 {
10857 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10858
10859 if (iterations > 0)
10860 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10861 }
10862
10863 if (num != CONST1_RTX (mode))
10864 {
10865 /* As the approximate reciprocal of DEN is already calculated, only
10866 calculate the approximate division when NUM is not 1.0. */
10867 rtx xnum = force_reg (mode, num);
10868 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10869 }
10870
10871 /* Finalize the approximation. */
10872 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10873 return true;
10874 }
10875
10876 /* Return the number of instructions that can be issued per cycle. */
10877 static int
10878 aarch64_sched_issue_rate (void)
10879 {
10880 return aarch64_tune_params.issue_rate;
10881 }
10882
10883 static int
10884 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10885 {
10886 int issue_rate = aarch64_sched_issue_rate ();
10887
10888 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10889 }
10890
10891
10892 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10893 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10894 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10895
10896 static int
10897 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10898 int ready_index)
10899 {
10900 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10901 }
10902
10903
10904 /* Vectorizer cost model target hooks. */
10905
10906 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10907 static int
10908 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10909 tree vectype,
10910 int misalign ATTRIBUTE_UNUSED)
10911 {
10912 unsigned elements;
10913 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10914 bool fp = false;
10915
10916 if (vectype != NULL)
10917 fp = FLOAT_TYPE_P (vectype);
10918
10919 switch (type_of_cost)
10920 {
10921 case scalar_stmt:
10922 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10923
10924 case scalar_load:
10925 return costs->scalar_load_cost;
10926
10927 case scalar_store:
10928 return costs->scalar_store_cost;
10929
10930 case vector_stmt:
10931 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10932
10933 case vector_load:
10934 return costs->vec_align_load_cost;
10935
10936 case vector_store:
10937 return costs->vec_store_cost;
10938
10939 case vec_to_scalar:
10940 return costs->vec_to_scalar_cost;
10941
10942 case scalar_to_vec:
10943 return costs->scalar_to_vec_cost;
10944
10945 case unaligned_load:
10946 case vector_gather_load:
10947 return costs->vec_unalign_load_cost;
10948
10949 case unaligned_store:
10950 case vector_scatter_store:
10951 return costs->vec_unalign_store_cost;
10952
10953 case cond_branch_taken:
10954 return costs->cond_taken_branch_cost;
10955
10956 case cond_branch_not_taken:
10957 return costs->cond_not_taken_branch_cost;
10958
10959 case vec_perm:
10960 return costs->vec_permute_cost;
10961
10962 case vec_promote_demote:
10963 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10964
10965 case vec_construct:
10966 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10967 return elements / 2 + 1;
10968
10969 default:
10970 gcc_unreachable ();
10971 }
10972 }
10973
10974 /* Implement targetm.vectorize.add_stmt_cost. */
10975 static unsigned
10976 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10977 struct _stmt_vec_info *stmt_info, int misalign,
10978 enum vect_cost_model_location where)
10979 {
10980 unsigned *cost = (unsigned *) data;
10981 unsigned retval = 0;
10982
10983 if (flag_vect_cost_model)
10984 {
10985 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10986 int stmt_cost =
10987 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10988
10989 /* Statements in an inner loop relative to the loop being
10990 vectorized are weighted more heavily. The value here is
10991 arbitrary and could potentially be improved with analysis. */
10992 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10993 count *= 50; /* FIXME */
10994
10995 retval = (unsigned) (count * stmt_cost);
10996 cost[where] += retval;
10997 }
10998
10999 return retval;
11000 }
11001
11002 static void initialize_aarch64_code_model (struct gcc_options *);
11003
11004 /* Parse the TO_PARSE string and put the architecture struct that it
11005 selects into RES and the architectural features into ISA_FLAGS.
11006 Return an aarch64_parse_opt_result describing the parse result.
11007 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11008 When the TO_PARSE string contains an invalid extension,
11009 a copy of the string is created and stored to INVALID_EXTENSION. */
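/* For example (an illustrative value only), parsing "armv8.2-a+fp16"
   matches the "armv8.2-a" entry in all_architectures and then hands the
   "+fp16" suffix to aarch64_parse_extension.  */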
11010
11011 static enum aarch64_parse_opt_result
11012 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11013 unsigned long *isa_flags, std::string *invalid_extension)
11014 {
11015 const char *ext;
11016 const struct processor *arch;
11017 size_t len;
11018
11019 ext = strchr (to_parse, '+');
11020
11021 if (ext != NULL)
11022 len = ext - to_parse;
11023 else
11024 len = strlen (to_parse);
11025
11026 if (len == 0)
11027 return AARCH64_PARSE_MISSING_ARG;
11028
11029
11030 /* Loop through the list of supported ARCHes to find a match. */
11031 for (arch = all_architectures; arch->name != NULL; arch++)
11032 {
11033 if (strlen (arch->name) == len
11034 && strncmp (arch->name, to_parse, len) == 0)
11035 {
11036 unsigned long isa_temp = arch->flags;
11037
11038 if (ext != NULL)
11039 {
11040 /* TO_PARSE string contains at least one extension. */
11041 enum aarch64_parse_opt_result ext_res
11042 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11043
11044 if (ext_res != AARCH64_PARSE_OK)
11045 return ext_res;
11046 }
11047 /* Extension parsing was successful. Confirm the result
11048 arch and ISA flags. */
11049 *res = arch;
11050 *isa_flags = isa_temp;
11051 return AARCH64_PARSE_OK;
11052 }
11053 }
11054
11055 /* ARCH name not found in list. */
11056 return AARCH64_PARSE_INVALID_ARG;
11057 }
11058
11059 /* Parse the TO_PARSE string and put the result tuning in RES and the
11060 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11061 describing the parse result. If there is an error parsing, RES and
11062 ISA_FLAGS are left unchanged.
11063 When the TO_PARSE string contains an invalid extension,
11064 a copy of the string is created and stored to INVALID_EXTENSION. */
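/* For example (an illustrative value only), parsing "cortex-a57+crypto"
   matches the "cortex-a57" entry in all_cores and then hands the
   "+crypto" suffix to aarch64_parse_extension.  */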
11065
11066 static enum aarch64_parse_opt_result
11067 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11068 unsigned long *isa_flags, std::string *invalid_extension)
11069 {
11070 const char *ext;
11071 const struct processor *cpu;
11072 size_t len;
11073
11074 ext = strchr (to_parse, '+');
11075
11076 if (ext != NULL)
11077 len = ext - to_parse;
11078 else
11079 len = strlen (to_parse);
11080
11081 if (len == 0)
11082 return AARCH64_PARSE_MISSING_ARG;
11083
11084
11085 /* Loop through the list of supported CPUs to find a match. */
11086 for (cpu = all_cores; cpu->name != NULL; cpu++)
11087 {
11088 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11089 {
11090 unsigned long isa_temp = cpu->flags;
11091
11092
11093 if (ext != NULL)
11094 {
11095 /* TO_PARSE string contains at least one extension. */
11096 enum aarch64_parse_opt_result ext_res
11097 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11098
11099 if (ext_res != AARCH64_PARSE_OK)
11100 return ext_res;
11101 }
11102 /* Extension parsing was successful. Confirm the result
11103 cpu and ISA flags. */
11104 *res = cpu;
11105 *isa_flags = isa_temp;
11106 return AARCH64_PARSE_OK;
11107 }
11108 }
11109
11110 /* CPU name not found in list. */
11111 return AARCH64_PARSE_INVALID_ARG;
11112 }
11113
11114 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11115 Return an aarch64_parse_opt_result describing the parse result.
11116 If the parsing fails, RES does not change. */
11117
11118 static enum aarch64_parse_opt_result
11119 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11120 {
11121 const struct processor *cpu;
11122
11123 /* Loop through the list of supported CPUs to find a match. */
11124 for (cpu = all_cores; cpu->name != NULL; cpu++)
11125 {
11126 if (strcmp (cpu->name, to_parse) == 0)
11127 {
11128 *res = cpu;
11129 return AARCH64_PARSE_OK;
11130 }
11131 }
11132
11133 /* CPU name not found in list. */
11134 return AARCH64_PARSE_INVALID_ARG;
11135 }
11136
11137 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11138 described in FLAG. If it is, return the index bit for that fusion type.
11139 If not, report an error (printing OPTION_NAME) and return zero. */
11140
11141 static unsigned int
11142 aarch64_parse_one_option_token (const char *token,
11143 size_t length,
11144 const struct aarch64_flag_desc *flag,
11145 const char *option_name)
11146 {
11147 for (; flag->name != NULL; flag++)
11148 {
11149 if (length == strlen (flag->name)
11150 && !strncmp (flag->name, token, length))
11151 return flag->flag;
11152 }
11153
11154 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11155 return 0;
11156 }
11157
11158 /* Parse OPTION which is a comma-separated list of flags to enable.
11159 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11160 default state we inherit from the CPU tuning structures. OPTION_NAME
11161 gives the top-level option we are parsing in the -moverride string,
11162 for use in error messages. */
11163
11164 static unsigned int
11165 aarch64_parse_boolean_options (const char *option,
11166 const struct aarch64_flag_desc *flags,
11167 unsigned int initial_state,
11168 const char *option_name)
11169 {
11170 const char separator = '.';
11171 const char* specs = option;
11172 const char* ntoken = option;
11173 unsigned int found_flags = initial_state;
11174
11175 while ((ntoken = strchr (specs, separator)))
11176 {
11177 size_t token_length = ntoken - specs;
11178 unsigned token_ops = aarch64_parse_one_option_token (specs,
11179 token_length,
11180 flags,
11181 option_name);
11182 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11183 in the token stream, reset the supported operations. So:
11184
11185 adrp+add.cmp+branch.none.adrp+add
11186
11187 would have the result of turning on only adrp+add fusion. */
11188 if (!token_ops)
11189 found_flags = 0;
11190
11191 found_flags |= token_ops;
11192 specs = ++ntoken;
11193 }
11194
11195 /* The string ended with a trailing separator; report the malformed input. */
11196 if (!(*specs))
11197 {
11198 error ("%s string ill-formed\n", option_name);
11199 return 0;
11200 }
11201
11202 /* We still have one more token to parse. */
11203 size_t token_length = strlen (specs);
11204 unsigned token_ops = aarch64_parse_one_option_token (specs,
11205 token_length,
11206 flags,
11207 option_name);
11208 if (!token_ops)
11209 found_flags = 0;
11210
11211 found_flags |= token_ops;
11212 return found_flags;
11213 }
11214
11215 /* Support for overriding instruction fusion. */
11216
11217 static void
11218 aarch64_parse_fuse_string (const char *fuse_string,
11219 struct tune_params *tune)
11220 {
11221 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11222 aarch64_fusible_pairs,
11223 tune->fusible_ops,
11224 "fuse=");
11225 }
11226
11227 /* Support for overriding other tuning flags. */
11228
11229 static void
11230 aarch64_parse_tune_string (const char *tune_string,
11231 struct tune_params *tune)
11232 {
11233 tune->extra_tuning_flags
11234 = aarch64_parse_boolean_options (tune_string,
11235 aarch64_tuning_flags,
11236 tune->extra_tuning_flags,
11237 "tune=");
11238 }
11239
11240 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11241 Accept the valid SVE vector widths allowed by
11242 aarch64_sve_vector_bits_enum and use it to override sve_width
11243 in TUNE. */
11244
11245 static void
11246 aarch64_parse_sve_width_string (const char *tune_string,
11247 struct tune_params *tune)
11248 {
11249 int width = -1;
11250
11251 int n = sscanf (tune_string, "%d", &width);
11252 if (n == EOF)
11253 {
11254 error ("invalid format for sve_width");
11255 return;
11256 }
11257 switch (width)
11258 {
11259 case SVE_128:
11260 case SVE_256:
11261 case SVE_512:
11262 case SVE_1024:
11263 case SVE_2048:
11264 break;
11265 default:
11266 error ("invalid sve_width value: %d", width);
11267 }
11268 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11269 }
11270
11271 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11272 we understand. If it is, extract the option string and hand it off to
11273 the appropriate function. */
11274
11275 void
11276 aarch64_parse_one_override_token (const char* token,
11277 size_t length,
11278 struct tune_params *tune)
11279 {
11280 const struct aarch64_tuning_override_function *fn
11281 = aarch64_tuning_override_functions;
11282
11283 const char *option_part = strchr (token, '=');
11284 if (!option_part)
11285 {
11286 error ("tuning string missing in option (%s)", token);
11287 return;
11288 }
11289
11290 /* Get the length of the option name. */
11291 length = option_part - token;
11292 /* Skip the '=' to get to the option string. */
11293 option_part++;
11294
11295 for (; fn->name != NULL; fn++)
11296 {
11297 if (!strncmp (fn->name, token, length))
11298 {
11299 fn->parse_override (option_part, tune);
11300 return;
11301 }
11302 }
11303
11304 error ("unknown tuning option (%s)",token);
11305 return;
11306 }
11307
11308 /* Validate the TLS size and clamp it to what the code model in OPTS allows. */
11309
11310 static void
11311 initialize_aarch64_tls_size (struct gcc_options *opts)
11312 {
11313 if (aarch64_tls_size == 0)
11314 aarch64_tls_size = 24;
11315
11316 switch (opts->x_aarch64_cmodel_var)
11317 {
11318 case AARCH64_CMODEL_TINY:
11319 /* Both the default and maximum TLS size allowed under tiny is 1M, which
11320 needs two instructions to address, so we clamp the size to 24. */
11321 if (aarch64_tls_size > 24)
11322 aarch64_tls_size = 24;
11323 break;
11324 case AARCH64_CMODEL_SMALL:
11325 /* The maximum TLS size allowed under small is 4G. */
11326 if (aarch64_tls_size > 32)
11327 aarch64_tls_size = 32;
11328 break;
11329 case AARCH64_CMODEL_LARGE:
11330 /* The maximum TLS size allowed under large is 16E.
11331 FIXME: 16E would need a 64-bit offset; we only support 48-bit offsets now. */
11332 if (aarch64_tls_size > 48)
11333 aarch64_tls_size = 48;
11334 break;
11335 default:
11336 gcc_unreachable ();
11337 }
11338
11339 return;
11340 }
11341
11342 /* Parse STRING looking for options in the format:
11343 string :: option:string
11344 option :: name=substring
11345 name :: {a-z}
11346 substring :: defined by option. */
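/* For instance (an illustrative value only), the string
   "sve_width=256:fuse=adrp+add.cmp+branch" contains two colon-separated
   options; the names assume the "sve_width" and "fuse" override handlers
   and the fusion pair names defined elsewhere in this file.  */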
11347
11348 static void
11349 aarch64_parse_override_string (const char* input_string,
11350 struct tune_params* tune)
11351 {
11352 const char separator = ':';
11353 size_t string_length = strlen (input_string) + 1;
11354 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11355 char *string = string_root;
11356 strncpy (string, input_string, string_length);
11357 string[string_length - 1] = '\0';
11358
11359 char* ntoken = string;
11360
11361 while ((ntoken = strchr (string, separator)))
11362 {
11363 size_t token_length = ntoken - string;
11364 /* Make this substring look like a string. */
11365 *ntoken = '\0';
11366 aarch64_parse_one_override_token (string, token_length, tune);
11367 string = ++ntoken;
11368 }
11369
11370 /* One last option to parse. */
11371 aarch64_parse_one_override_token (string, strlen (string), tune);
11372 free (string_root);
11373 }
11374
11375
11376 static void
11377 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11378 {
11379 if (accepted_branch_protection_string)
11380 {
11381 opts->x_aarch64_branch_protection_string
11382 = xstrdup (accepted_branch_protection_string);
11383 }
11384
11385 /* PR 70044: We have to be careful about being called multiple times for the
11386 same function. This means all changes should be repeatable. */
11387
11388 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11389 Disable the frame pointer flag so the mid-end will not use a frame
11390 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11391 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11392 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11393 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11394 if (opts->x_flag_omit_frame_pointer == 0)
11395 opts->x_flag_omit_frame_pointer = 2;
11396
11397 /* If not optimizing for size, set the default
11398 alignment to what the target wants. */
11399 if (!opts->x_optimize_size)
11400 {
11401 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11402 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11403 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11404 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11405 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11406 opts->x_str_align_functions = aarch64_tune_params.function_align;
11407 }
11408
11409 /* We default to no pc-relative literal loads. */
11410
11411 aarch64_pcrelative_literal_loads = false;
11412
11413 /* If -mpc-relative-literal-loads is set on the command line, this
11414 implies that the user asked for PC relative literal loads. */
11415 if (opts->x_pcrelative_literal_loads == 1)
11416 aarch64_pcrelative_literal_loads = true;
11417
11418 /* In the tiny memory model it makes no sense to disallow PC relative
11419 literal pool loads. */
11420 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11421 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11422 aarch64_pcrelative_literal_loads = true;
11423
11424 /* When enabling the lower precision Newton series for the square root, also
11425 enable it for the reciprocal square root, since the latter is an
11426 intermediate step for the former. */
11427 if (flag_mlow_precision_sqrt)
11428 flag_mrecip_low_precision_sqrt = true;
11429 }
11430
11431 /* 'Unpack' the internal tuning structs and update the options
11432 in OPTS. The caller must have set up selected_tune and selected_arch
11433 as all the other target-specific codegen decisions are
11434 derived from them. */
11435
11436 void
11437 aarch64_override_options_internal (struct gcc_options *opts)
11438 {
11439 aarch64_tune_flags = selected_tune->flags;
11440 aarch64_tune = selected_tune->sched_core;
11441 /* Make a copy of the tuning parameters attached to the core, which
11442 we may later overwrite. */
11443 aarch64_tune_params = *(selected_tune->tune);
11444 aarch64_architecture_version = selected_arch->architecture_version;
11445
11446 if (opts->x_aarch64_override_tune_string)
11447 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11448 &aarch64_tune_params);
11449
11450 /* This target defaults to strict volatile bitfields. */
11451 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11452 opts->x_flag_strict_volatile_bitfields = 1;
11453
11454 if (aarch64_stack_protector_guard == SSP_GLOBAL
11455 && opts->x_aarch64_stack_protector_guard_offset_str)
11456 {
11457 error ("incompatible options %<-mstack-protector-guard=global%> and"
11458 "%<-mstack-protector-guard-offset=%qs%>",
11459 aarch64_stack_protector_guard_offset_str);
11460 }
11461
11462 if (aarch64_stack_protector_guard == SSP_SYSREG
11463 && !(opts->x_aarch64_stack_protector_guard_offset_str
11464 && opts->x_aarch64_stack_protector_guard_reg_str))
11465 {
11466 error ("both %<-mstack-protector-guard-offset%> and "
11467 "%<-mstack-protector-guard-reg%> must be used "
11468 "with %<-mstack-protector-guard=sysreg%>");
11469 }
11470
11471 if (opts->x_aarch64_stack_protector_guard_reg_str)
11472 {
11473 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11474 error ("specify a system register with a small string length.");
11475 }
11476
11477 if (opts->x_aarch64_stack_protector_guard_offset_str)
11478 {
11479 char *end;
11480 const char *str = aarch64_stack_protector_guard_offset_str;
11481 errno = 0;
11482 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11483 if (!*str || *end || errno)
11484 error ("%qs is not a valid offset in %qs", str,
11485 "%<-mstack-protector-guard-offset=%>");
11486 aarch64_stack_protector_guard_offset = offs;
11487 }
11488
11489 initialize_aarch64_code_model (opts);
11490 initialize_aarch64_tls_size (opts);
11491
11492 int queue_depth = 0;
11493 switch (aarch64_tune_params.autoprefetcher_model)
11494 {
11495 case tune_params::AUTOPREFETCHER_OFF:
11496 queue_depth = -1;
11497 break;
11498 case tune_params::AUTOPREFETCHER_WEAK:
11499 queue_depth = 0;
11500 break;
11501 case tune_params::AUTOPREFETCHER_STRONG:
11502 queue_depth = max_insn_queue_index + 1;
11503 break;
11504 default:
11505 gcc_unreachable ();
11506 }
11507
11508 /* We don't mind passing in global_options_set here as we don't use
11509 the *options_set structs anyway. */
11510 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11511 queue_depth,
11512 opts->x_param_values,
11513 global_options_set.x_param_values);
11514
11515 /* Set up parameters to be used in prefetching algorithm. Do not
11516 override the defaults unless we are tuning for a core we have
11517 researched values for. */
11518 if (aarch64_tune_params.prefetch->num_slots > 0)
11519 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11520 aarch64_tune_params.prefetch->num_slots,
11521 opts->x_param_values,
11522 global_options_set.x_param_values);
11523 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11524 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11525 aarch64_tune_params.prefetch->l1_cache_size,
11526 opts->x_param_values,
11527 global_options_set.x_param_values);
11528 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11529 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11530 aarch64_tune_params.prefetch->l1_cache_line_size,
11531 opts->x_param_values,
11532 global_options_set.x_param_values);
11533 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11534 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11535 aarch64_tune_params.prefetch->l2_cache_size,
11536 opts->x_param_values,
11537 global_options_set.x_param_values);
11538 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11539 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11540 0,
11541 opts->x_param_values,
11542 global_options_set.x_param_values);
11543 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11544 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11545 aarch64_tune_params.prefetch->minimum_stride,
11546 opts->x_param_values,
11547 global_options_set.x_param_values);
11548
11549 /* Use the alternative scheduling-pressure algorithm by default. */
11550 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11551 opts->x_param_values,
11552 global_options_set.x_param_values);
11553
11554 /* If the user hasn't changed it via configure then set the default to 64 KB
11555 for the backend. */
11556 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11557 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11558 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11559 opts->x_param_values,
11560 global_options_set.x_param_values);
11561
11562 /* Validate the guard size. */
11563 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11564
11565 /* Enforce that the probing interval equals the guard size so the mid-end
11566 does the right thing. */
11567 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11568 guard_size,
11569 opts->x_param_values,
11570 global_options_set.x_param_values);
11571
11572 /* The maybe_set calls won't update the value if the user has explicitly set
11573 one. Which means we need to validate that probing interval and guard size
11574 are equal. */
11575 int probe_interval
11576 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11577 if (guard_size != probe_interval)
11578 error ("stack clash guard size '%d' must be equal to probing interval "
11579 "'%d'", guard_size, probe_interval);
11580
11581 /* Enable sw prefetching at specified optimization level for
11582 CPUS that have prefetch. Lower optimization level threshold by 1
11583 when profiling is enabled. */
11584 if (opts->x_flag_prefetch_loop_arrays < 0
11585 && !opts->x_optimize_size
11586 && aarch64_tune_params.prefetch->default_opt_level >= 0
11587 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11588 opts->x_flag_prefetch_loop_arrays = 1;
11589
11590 if (opts->x_aarch64_arch_string == NULL)
11591 opts->x_aarch64_arch_string = selected_arch->name;
11592 if (opts->x_aarch64_cpu_string == NULL)
11593 opts->x_aarch64_cpu_string = selected_cpu->name;
11594 if (opts->x_aarch64_tune_string == NULL)
11595 opts->x_aarch64_tune_string = selected_tune->name;
11596
11597 aarch64_override_options_after_change_1 (opts);
11598 }
11599
11600 /* Print a hint with a suggestion for a core or architecture name that
11601 most closely resembles what the user passed in STR. ARCH is true if
11602 the user is asking for an architecture name. ARCH is false if the user
11603 is asking for a core name. */
11604
11605 static void
11606 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11607 {
11608 auto_vec<const char *> candidates;
11609 const struct processor *entry = arch ? all_architectures : all_cores;
11610 for (; entry->name != NULL; entry++)
11611 candidates.safe_push (entry->name);
11612
11613 #ifdef HAVE_LOCAL_CPU_DETECT
11614 /* Add also "native" as possible value. */
11615 if (arch)
11616 candidates.safe_push ("native");
11617 #endif
11618
11619 char *s;
11620 const char *hint = candidates_list_and_hint (str, s, candidates);
11621 if (hint)
11622 inform (input_location, "valid arguments are: %s;"
11623 " did you mean %qs?", s, hint);
11624 else
11625 inform (input_location, "valid arguments are: %s", s);
11626
11627 XDELETEVEC (s);
11628 }
11629
11630 /* Print a hint with a suggestion for a core name that most closely resembles
11631 what the user passed in STR. */
11632
11633 inline static void
11634 aarch64_print_hint_for_core (const char *str)
11635 {
11636 aarch64_print_hint_for_core_or_arch (str, false);
11637 }
11638
11639 /* Print a hint with a suggestion for an architecture name that most closely
11640 resembles what the user passed in STR. */
11641
11642 inline static void
11643 aarch64_print_hint_for_arch (const char *str)
11644 {
11645 aarch64_print_hint_for_core_or_arch (str, true);
11646 }
11647
11648
11649 /* Print a hint with a suggestion for an extension name
11650 that most closely resembles what the user passed in STR. */
11651
11652 void
11653 aarch64_print_hint_for_extensions (const std::string &str)
11654 {
11655 auto_vec<const char *> candidates;
11656 aarch64_get_all_extension_candidates (&candidates);
11657 char *s;
11658 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11659 if (hint)
11660 inform (input_location, "valid arguments are: %s;"
11661 " did you mean %qs?", s, hint);
11662 else
11663 inform (input_location, "valid arguments are: %s", s);
11664
11665 XDELETEVEC (s);
11666 }
11667
11668 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11669 specified in STR and throw errors if appropriate. Put the results if
11670 they are valid in RES and ISA_FLAGS. Return whether the option is
11671 valid. */
11672
11673 static bool
11674 aarch64_validate_mcpu (const char *str, const struct processor **res,
11675 unsigned long *isa_flags)
11676 {
11677 std::string invalid_extension;
11678 enum aarch64_parse_opt_result parse_res
11679 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11680
11681 if (parse_res == AARCH64_PARSE_OK)
11682 return true;
11683
11684 switch (parse_res)
11685 {
11686 case AARCH64_PARSE_MISSING_ARG:
11687 error ("missing cpu name in %<-mcpu=%s%>", str);
11688 break;
11689 case AARCH64_PARSE_INVALID_ARG:
11690 error ("unknown value %qs for %<-mcpu%>", str);
11691 aarch64_print_hint_for_core (str);
11692 break;
11693 case AARCH64_PARSE_INVALID_FEATURE:
11694 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11695 invalid_extension.c_str (), str);
11696 aarch64_print_hint_for_extensions (invalid_extension);
11697 break;
11698 default:
11699 gcc_unreachable ();
11700 }
11701
11702 return false;
11703 }
11704
11705 /* Parse CONST_STR for branch protection features specified in
11706 aarch64_branch_protect_types, and set any global variables required. Return
11707 the parsing result and assign LAST_STR to the last processed token from
11708 CONST_STR so that it can be used for error reporting. */
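/* For example (an illustrative value only), a string such as
   "pac-ret+leaf+bti" is split on '+' into the type "pac-ret", its subtype
   "leaf" and the further type "bti"; the exact set of accepted names is
   defined by aarch64_branch_protect_types.  */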
11709
11710 static enum aarch64_parse_opt_result
11711 aarch64_parse_branch_protection (const char *const_str,
11712 char **last_str)
11713 {
11714 char *str_root = xstrdup (const_str);
11715 char* token_save = NULL;
11716 char *str = strtok_r (str_root, "+", &token_save);
11717 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11718 if (!str)
11719 res = AARCH64_PARSE_MISSING_ARG;
11720 else
11721 {
11722 char *next_str = strtok_r (NULL, "+", &token_save);
11723 /* Reset the branch protection features to their defaults. */
11724 aarch64_handle_no_branch_protection (NULL, NULL);
11725
11726 while (str && res == AARCH64_PARSE_OK)
11727 {
11728 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11729 bool found = false;
11730 /* Search for this type. */
11731 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11732 {
11733 if (strcmp (str, type->name) == 0)
11734 {
11735 found = true;
11736 res = type->handler (str, next_str);
11737 str = next_str;
11738 next_str = strtok_r (NULL, "+", &token_save);
11739 }
11740 else
11741 type++;
11742 }
11743 if (found && res == AARCH64_PARSE_OK)
11744 {
11745 bool found_subtype = true;
11746 /* Loop through each token until we find one that isn't a
11747 subtype. */
11748 while (found_subtype)
11749 {
11750 found_subtype = false;
11751 const aarch64_branch_protect_type *subtype = type->subtypes;
11752 /* Search for the subtype. */
11753 while (str && subtype && subtype->name && !found_subtype
11754 && res == AARCH64_PARSE_OK)
11755 {
11756 if (strcmp (str, subtype->name) == 0)
11757 {
11758 found_subtype = true;
11759 res = subtype->handler (str, next_str);
11760 str = next_str;
11761 next_str = strtok_r (NULL, "+", &token_save);
11762 }
11763 else
11764 subtype++;
11765 }
11766 }
11767 }
11768 else if (!found)
11769 res = AARCH64_PARSE_INVALID_ARG;
11770 }
11771 }
11772 /* Copy the last processed token into the argument to pass it back.
11773 Used by option and attribute validation to print the offending token. */
11774 if (last_str)
11775 {
11776 if (str) strcpy (*last_str, str);
11777 else *last_str = NULL;
11778 }
11779 if (res == AARCH64_PARSE_OK)
11780 {
11781 /* If needed, alloc the accepted string then copy in const_str.
11782 Used by aarch64_override_options_after_change_1. */
11783 if (!accepted_branch_protection_string)
11784 accepted_branch_protection_string = (char *) xmalloc (
11785 BRANCH_PROTECT_STR_MAX
11786 + 1);
11787 strncpy (accepted_branch_protection_string, const_str,
11788 BRANCH_PROTECT_STR_MAX + 1);
11789 /* Forcibly null-terminate. */
11790 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11791 }
11792 return res;
11793 }
11794
11795 static bool
11796 aarch64_validate_mbranch_protection (const char *const_str)
11797 {
11798 char *str = (char *) xmalloc (strlen (const_str) + 1);
11799 enum aarch64_parse_opt_result res =
11800 aarch64_parse_branch_protection (const_str, &str);
11801 if (res == AARCH64_PARSE_INVALID_ARG)
11802 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11803 else if (res == AARCH64_PARSE_MISSING_ARG)
11804 error ("missing arg for %<-mbranch-protection=%>");
11805 free (str);
11806 return res == AARCH64_PARSE_OK;
11807 }
11808
11809 /* Validate a command-line -march option. Parse the arch and extensions
11810 (if any) specified in STR and throw errors if appropriate. Put the
11811 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11812 option is valid. */
11813
11814 static bool
11815 aarch64_validate_march (const char *str, const struct processor **res,
11816 unsigned long *isa_flags)
11817 {
11818 std::string invalid_extension;
11819 enum aarch64_parse_opt_result parse_res
11820 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11821
11822 if (parse_res == AARCH64_PARSE_OK)
11823 return true;
11824
11825 switch (parse_res)
11826 {
11827 case AARCH64_PARSE_MISSING_ARG:
11828 error ("missing arch name in %<-march=%s%>", str);
11829 break;
11830 case AARCH64_PARSE_INVALID_ARG:
11831 error ("unknown value %qs for %<-march%>", str);
11832 aarch64_print_hint_for_arch (str);
11833 break;
11834 case AARCH64_PARSE_INVALID_FEATURE:
11835 error ("invalid feature modifier %qs in %<-march=%s%>",
11836 invalid_extension.c_str (), str);
11837 aarch64_print_hint_for_extensions (invalid_extension);
11838 break;
11839 default:
11840 gcc_unreachable ();
11841 }
11842
11843 return false;
11844 }
11845
11846 /* Validate a command-line -mtune option. Parse the cpu
11847 specified in STR and throw errors if appropriate. Put the
11848 result, if it is valid, in RES. Return whether the option is
11849 valid. */
11850
11851 static bool
11852 aarch64_validate_mtune (const char *str, const struct processor **res)
11853 {
11854 enum aarch64_parse_opt_result parse_res
11855 = aarch64_parse_tune (str, res);
11856
11857 if (parse_res == AARCH64_PARSE_OK)
11858 return true;
11859
11860 switch (parse_res)
11861 {
11862 case AARCH64_PARSE_MISSING_ARG:
11863 error ("missing cpu name in %<-mtune=%s%>", str);
11864 break;
11865 case AARCH64_PARSE_INVALID_ARG:
11866 error ("unknown value %qs for %<-mtune%>", str);
11867 aarch64_print_hint_for_core (str);
11868 break;
11869 default:
11870 gcc_unreachable ();
11871 }
11872 return false;
11873 }
11874
11875 /* Return the CPU corresponding to the enum CPU.
11876 If it doesn't specify a cpu, return the default. */
11877
11878 static const struct processor *
11879 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11880 {
11881 if (cpu != aarch64_none)
11882 return &all_cores[cpu];
11883
11884 /* The & 0x3f is to extract the bottom 6 bits that encode the
11885 default cpu as selected by the --with-cpu GCC configure option
11886 in config.gcc.
11887 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11888 flags mechanism should be reworked to make it more sane. */
11889 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11890 }
11891
11892 /* Return the architecture corresponding to the enum ARCH.
11893 If it doesn't specify a valid architecture, return the default. */
11894
11895 static const struct processor *
11896 aarch64_get_arch (enum aarch64_arch arch)
11897 {
11898 if (arch != aarch64_no_arch)
11899 return &all_architectures[arch];
11900
11901 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11902
11903 return &all_architectures[cpu->arch];
11904 }
11905
11906 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11907
11908 static poly_uint16
11909 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11910 {
11911 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11912 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11913 deciding which .md file patterns to use and when deciding whether
11914 something is a legitimate address or constant. */
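  /* For example, -msve-vector-bits=512 yields the constant VG 512/64 = 8,
     whereas SVE_SCALABLE and SVE_128 yield the runtime-variable value
     2 + 2x, where x counts the 128-bit blocks beyond the minimum vector
     length (an illustration of the conversion, not an exhaustive list).  */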
11915 if (value == SVE_SCALABLE || value == SVE_128)
11916 return poly_uint16 (2, 2);
11917 else
11918 return (int) value / 64;
11919 }
11920
11921 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11922 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11923 tuning structs. In particular it must set selected_tune and
11924 aarch64_isa_flags that define the available ISA features and tuning
11925 decisions. It must also set selected_arch as this will be used to
11926 output the .arch asm tags for each function. */
11927
11928 static void
11929 aarch64_override_options (void)
11930 {
11931 unsigned long cpu_isa = 0;
11932 unsigned long arch_isa = 0;
11933 aarch64_isa_flags = 0;
11934
11935 bool valid_cpu = true;
11936 bool valid_tune = true;
11937 bool valid_arch = true;
11938
11939 selected_cpu = NULL;
11940 selected_arch = NULL;
11941 selected_tune = NULL;
11942
11943 if (aarch64_branch_protection_string)
11944 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11945
11946 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11947 If either of -march or -mtune is given, they override their
11948 respective component of -mcpu. */
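  /* For example (illustrative only), "-mcpu=cortex-a72 -march=armv8.2-a"
     takes its ISA flags from the -march value while still tuning for
     cortex-a72, and the mismatch between the two architectures is
     diagnosed by the warning further down.  */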
11949 if (aarch64_cpu_string)
11950 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11951 &cpu_isa);
11952
11953 if (aarch64_arch_string)
11954 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11955 &arch_isa);
11956
11957 if (aarch64_tune_string)
11958 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11959
11960 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11961 SUBTARGET_OVERRIDE_OPTIONS;
11962 #endif
11963
11964 /* If the user did not specify a processor, choose the default
11965 one for them. This will be the CPU set during configuration using
11966 --with-cpu, otherwise it is "generic". */
11967 if (!selected_cpu)
11968 {
11969 if (selected_arch)
11970 {
11971 selected_cpu = &all_cores[selected_arch->ident];
11972 aarch64_isa_flags = arch_isa;
11973 explicit_arch = selected_arch->arch;
11974 }
11975 else
11976 {
11977 /* Get default configure-time CPU. */
11978 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11979 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11980 }
11981
11982 if (selected_tune)
11983 explicit_tune_core = selected_tune->ident;
11984 }
11985 /* If both -mcpu and -march are specified check that they are architecturally
11986 compatible, warn if they're not and prefer the -march ISA flags. */
11987 else if (selected_arch)
11988 {
11989 if (selected_arch->arch != selected_cpu->arch)
11990 {
11991 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
11992 all_architectures[selected_cpu->arch].name,
11993 selected_arch->name);
11994 }
11995 aarch64_isa_flags = arch_isa;
11996 explicit_arch = selected_arch->arch;
11997 explicit_tune_core = selected_tune ? selected_tune->ident
11998 : selected_cpu->ident;
11999 }
12000 else
12001 {
12002 /* -mcpu but no -march. */
12003 aarch64_isa_flags = cpu_isa;
12004 explicit_tune_core = selected_tune ? selected_tune->ident
12005 : selected_cpu->ident;
12006 gcc_assert (selected_cpu);
12007 selected_arch = &all_architectures[selected_cpu->arch];
12008 explicit_arch = selected_arch->arch;
12009 }
12010
12011 /* Set the arch as well, as we will need it when outputting
12012 the .arch directive in assembly. */
12013 if (!selected_arch)
12014 {
12015 gcc_assert (selected_cpu);
12016 selected_arch = &all_architectures[selected_cpu->arch];
12017 }
12018
12019 if (!selected_tune)
12020 selected_tune = selected_cpu;
12021
12022 if (aarch64_enable_bti == 2)
12023 {
12024 #ifdef TARGET_ENABLE_BTI
12025 aarch64_enable_bti = 1;
12026 #else
12027 aarch64_enable_bti = 0;
12028 #endif
12029 }
12030
12031 /* Return address signing is currently not supported for ILP32 targets. For
12032 LP64 targets use the configured option in the absence of a command-line
12033 option for -mbranch-protection. */
12034 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12035 {
12036 #ifdef TARGET_ENABLE_PAC_RET
12037 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12038 #else
12039 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12040 #endif
12041 }
12042
12043 #ifndef HAVE_AS_MABI_OPTION
12044 /* The compiler may have been configured with 2.23.* binutils, which does
12045 not have support for ILP32. */
12046 if (TARGET_ILP32)
12047 error ("assembler does not support %<-mabi=ilp32%>");
12048 #endif
12049
12050 /* Convert -msve-vector-bits to a VG count. */
12051 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12052
12053 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12054 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12055
12056 /* Make sure we properly set up the explicit options. */
12057 if ((aarch64_cpu_string && valid_cpu)
12058 || (aarch64_tune_string && valid_tune))
12059 gcc_assert (explicit_tune_core != aarch64_none);
12060
12061 if ((aarch64_cpu_string && valid_cpu)
12062 || (aarch64_arch_string && valid_arch))
12063 gcc_assert (explicit_arch != aarch64_no_arch);
12064
12065 /* The pass to insert speculation tracking runs before
12066 shrink-wrapping and the latter does not know how to update the
12067 tracking status. So disable it in this case. */
12068 if (aarch64_track_speculation)
12069 flag_shrink_wrap = 0;
12070
12071 aarch64_override_options_internal (&global_options);
12072
12073 /* Save these options as the default ones in case we push and pop them later
12074 while processing functions with potential target attributes. */
12075 target_option_default_node = target_option_current_node
12076 = build_target_option_node (&global_options);
12077 }
12078
12079 /* Implement targetm.override_options_after_change. */
12080
12081 static void
12082 aarch64_override_options_after_change (void)
12083 {
12084 aarch64_override_options_after_change_1 (&global_options);
12085 }
12086
12087 static struct machine_function *
12088 aarch64_init_machine_status (void)
12089 {
12090 struct machine_function *machine;
12091 machine = ggc_cleared_alloc<machine_function> ();
12092 return machine;
12093 }
12094
12095 void
12096 aarch64_init_expanders (void)
12097 {
12098 init_machine_status = aarch64_init_machine_status;
12099 }
12100
12101 /* Derive aarch64_cmodel from the code model and PIC options in OPTS. */
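/* For instance, with -fpic (flag_pic == 1) the small code model is mapped
   to AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small PIC
   relocations, whereas -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC; this only illustrates the mapping below.  */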
12102 static void
12103 initialize_aarch64_code_model (struct gcc_options *opts)
12104 {
12105 if (opts->x_flag_pic)
12106 {
12107 switch (opts->x_aarch64_cmodel_var)
12108 {
12109 case AARCH64_CMODEL_TINY:
12110 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12111 break;
12112 case AARCH64_CMODEL_SMALL:
12113 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12114 aarch64_cmodel = (flag_pic == 2
12115 ? AARCH64_CMODEL_SMALL_PIC
12116 : AARCH64_CMODEL_SMALL_SPIC);
12117 #else
12118 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12119 #endif
12120 break;
12121 case AARCH64_CMODEL_LARGE:
12122 sorry ("code model %qs with %<-f%s%>", "large",
12123 opts->x_flag_pic > 1 ? "PIC" : "pic");
12124 break;
12125 default:
12126 gcc_unreachable ();
12127 }
12128 }
12129 else
12130 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12131 }
12132
12133 /* Implement TARGET_OPTION_SAVE. */
12134
12135 static void
12136 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12137 {
12138 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12139 ptr->x_aarch64_branch_protection_string
12140 = opts->x_aarch64_branch_protection_string;
12141 }
12142
12143 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12144 using the information saved in PTR. */
12145
12146 static void
12147 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12148 {
12149 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12150 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12151 opts->x_explicit_arch = ptr->x_explicit_arch;
12152 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12153 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12154 opts->x_aarch64_branch_protection_string
12155 = ptr->x_aarch64_branch_protection_string;
12156 if (opts->x_aarch64_branch_protection_string)
12157 {
12158 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12159 NULL);
12160 }
12161
12162 aarch64_override_options_internal (opts);
12163 }
12164
12165 /* Implement TARGET_OPTION_PRINT. */
12166
12167 static void
12168 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12169 {
12170 const struct processor *cpu
12171 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12172 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12173 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12174 std::string extension
12175 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12176
12177 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12178 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12179 arch->name, extension.c_str ());
12180 }
12181
12182 static GTY(()) tree aarch64_previous_fndecl;
12183
12184 void
12185 aarch64_reset_previous_fndecl (void)
12186 {
12187 aarch64_previous_fndecl = NULL;
12188 }
12189
12190 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12191 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12192 make sure optab availability predicates are recomputed when necessary. */
12193
12194 void
12195 aarch64_save_restore_target_globals (tree new_tree)
12196 {
12197 if (TREE_TARGET_GLOBALS (new_tree))
12198 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12199 else if (new_tree == target_option_default_node)
12200 restore_target_globals (&default_target_globals);
12201 else
12202 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12203 }
12204
12205 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12206 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12207 of the function, if such exists. This function may be called multiple
12208 times on a single function so use aarch64_previous_fndecl to avoid
12209 setting up identical state. */
12210
12211 static void
12212 aarch64_set_current_function (tree fndecl)
12213 {
12214 if (!fndecl || fndecl == aarch64_previous_fndecl)
12215 return;
12216
12217 tree old_tree = (aarch64_previous_fndecl
12218 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12219 : NULL_TREE);
12220
12221 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12222
12223 /* If current function has no attributes but the previous one did,
12224 use the default node. */
12225 if (!new_tree && old_tree)
12226 new_tree = target_option_default_node;
12227
12228 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12229 the default have been handled by aarch64_save_restore_target_globals from
12230 aarch64_pragma_target_parse. */
12231 if (old_tree == new_tree)
12232 return;
12233
12234 aarch64_previous_fndecl = fndecl;
12235
12236 /* First set the target options. */
12237 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12238
12239 aarch64_save_restore_target_globals (new_tree);
12240 }
12241
12242 /* Enum describing the various ways we can handle attributes.
12243 In many cases we can reuse the generic option handling machinery. */
12244
12245 enum aarch64_attr_opt_type
12246 {
12247 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12248 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12249 aarch64_attr_enum, /* Attribute sets an enum variable. */
12250 aarch64_attr_custom /* Attribute requires a custom handling function. */
12251 };
12252
12253 /* All the information needed to handle a target attribute.
12254 NAME is the name of the attribute.
12255 ATTR_TYPE specifies the type of behavior of the attribute as described
12256 in the definition of enum aarch64_attr_opt_type.
12257 ALLOW_NEG is true if the attribute supports a "no-" form.
12258 HANDLER is the function that takes the attribute string as an argument.
12259 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12260 OPT_NUM is the enum specifying the option that the attribute modifies.
12261 This is needed for attributes that mirror the behavior of a command-line
12262 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12263 aarch64_attr_enum. */
12264
12265 struct aarch64_attribute_info
12266 {
12267 const char *name;
12268 enum aarch64_attr_opt_type attr_type;
12269 bool allow_neg;
12270 bool (*handler) (const char *);
12271 enum opt_code opt_num;
12272 };
12273
12274 /* Handle the argument STR to the arch= target attribute. */
12275
12276 static bool
12277 aarch64_handle_attr_arch (const char *str)
12278 {
12279 const struct processor *tmp_arch = NULL;
12280 std::string invalid_extension;
12281 enum aarch64_parse_opt_result parse_res
12282 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12283
12284 if (parse_res == AARCH64_PARSE_OK)
12285 {
12286 gcc_assert (tmp_arch);
12287 selected_arch = tmp_arch;
12288 explicit_arch = selected_arch->arch;
12289 return true;
12290 }
12291
12292 switch (parse_res)
12293 {
12294 case AARCH64_PARSE_MISSING_ARG:
12295 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12296 break;
12297 case AARCH64_PARSE_INVALID_ARG:
12298 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12299 aarch64_print_hint_for_arch (str);
12300 break;
12301 case AARCH64_PARSE_INVALID_FEATURE:
12302 error ("invalid feature modifier %s of value (\"%s\") in "
12303 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12304 aarch64_print_hint_for_extensions (invalid_extension);
12305 break;
12306 default:
12307 gcc_unreachable ();
12308 }
12309
12310 return false;
12311 }
12312
12313 /* Handle the argument STR to the cpu= target attribute. */
12314
12315 static bool
12316 aarch64_handle_attr_cpu (const char *str)
12317 {
12318 const struct processor *tmp_cpu = NULL;
12319 std::string invalid_extension;
12320 enum aarch64_parse_opt_result parse_res
12321 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12322
12323 if (parse_res == AARCH64_PARSE_OK)
12324 {
12325 gcc_assert (tmp_cpu);
12326 selected_tune = tmp_cpu;
12327 explicit_tune_core = selected_tune->ident;
12328
12329 selected_arch = &all_architectures[tmp_cpu->arch];
12330 explicit_arch = selected_arch->arch;
12331 return true;
12332 }
12333
12334 switch (parse_res)
12335 {
12336 case AARCH64_PARSE_MISSING_ARG:
12337 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12338 break;
12339 case AARCH64_PARSE_INVALID_ARG:
12340 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12341 aarch64_print_hint_for_core (str);
12342 break;
12343 case AARCH64_PARSE_INVALID_FEATURE:
12344 error ("invalid feature modifier %s of value (\"%s\") in "
12345 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12346 aarch64_print_hint_for_extensions (invalid_extension);
12347 break;
12348 default:
12349 gcc_unreachable ();
12350 }
12351
12352 return false;
12353 }
12354
12355 /* Handle the argument STR to the branch-protection= attribute. */
12356
12357 static bool
12358 aarch64_handle_attr_branch_protection (const char* str)
12359 {
12360 char *err_str = (char *) xmalloc (strlen (str) + 1);
12361 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12362 &err_str);
12363 bool success = false;
12364 switch (res)
12365 {
12366 case AARCH64_PARSE_MISSING_ARG:
12367 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12368 " attribute");
12369 break;
12370 case AARCH64_PARSE_INVALID_ARG:
12371 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12372 "=\")%> pragma or attribute", err_str);
12373 break;
12374 case AARCH64_PARSE_OK:
12375 success = true;
12376 /* Fall through. */
12377 case AARCH64_PARSE_INVALID_FEATURE:
12378 break;
12379 default:
12380 gcc_unreachable ();
12381 }
12382 free (err_str);
12383 return success;
12384 }
12385
12386 /* Handle the argument STR to the tune= target attribute. */
12387
12388 static bool
12389 aarch64_handle_attr_tune (const char *str)
12390 {
12391 const struct processor *tmp_tune = NULL;
12392 enum aarch64_parse_opt_result parse_res
12393 = aarch64_parse_tune (str, &tmp_tune);
12394
12395 if (parse_res == AARCH64_PARSE_OK)
12396 {
12397 gcc_assert (tmp_tune);
12398 selected_tune = tmp_tune;
12399 explicit_tune_core = selected_tune->ident;
12400 return true;
12401 }
12402
12403 switch (parse_res)
12404 {
12405 case AARCH64_PARSE_INVALID_ARG:
12406 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12407 aarch64_print_hint_for_core (str);
12408 break;
12409 default:
12410 gcc_unreachable ();
12411 }
12412
12413 return false;
12414 }
12415
12416 /* Parse an architecture extensions target attribute string specified in STR.
12417 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12418 if successful. Update aarch64_isa_flags to reflect the ISA features
12419 modified. */
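/* For example (illustrative only), a string such as "+nothing+fp" first
   clears all extension bits and then re-enables just the floating-point
   extension together with whatever aarch64_parse_extension implies
   for it.  */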
12420
12421 static bool
12422 aarch64_handle_attr_isa_flags (char *str)
12423 {
12424 enum aarch64_parse_opt_result parse_res;
12425 unsigned long isa_flags = aarch64_isa_flags;
12426
12427 /* We allow "+nothing" in the beginning to clear out all architectural
12428 features if the user wants to handpick specific features. */
12429 if (strncmp ("+nothing", str, 8) == 0)
12430 {
12431 isa_flags = 0;
12432 str += 8;
12433 }
12434
12435 std::string invalid_extension;
12436 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12437
12438 if (parse_res == AARCH64_PARSE_OK)
12439 {
12440 aarch64_isa_flags = isa_flags;
12441 return true;
12442 }
12443
12444 switch (parse_res)
12445 {
12446 case AARCH64_PARSE_MISSING_ARG:
12447 error ("missing value in %<target()%> pragma or attribute");
12448 break;
12449
12450 case AARCH64_PARSE_INVALID_FEATURE:
12451 error ("invalid feature modifier %s of value (\"%s\") in "
12452 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12453 break;
12454
12455 default:
12456 gcc_unreachable ();
12457 }
12458
12459 return false;
12460 }
12461
12462 /* The target attributes that we support. On top of these we also support just
12463 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12464 handled explicitly in aarch64_process_one_target_attr. */
12465
12466 static const struct aarch64_attribute_info aarch64_attributes[] =
12467 {
12468 { "general-regs-only", aarch64_attr_mask, false, NULL,
12469 OPT_mgeneral_regs_only },
12470 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12471 OPT_mfix_cortex_a53_835769 },
12472 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12473 OPT_mfix_cortex_a53_843419 },
12474 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12475 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12476 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12477 OPT_momit_leaf_frame_pointer },
12478 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12479 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12480 OPT_march_ },
12481 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12482 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12483 OPT_mtune_ },
12484 { "branch-protection", aarch64_attr_custom, false,
12485 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12486 { "sign-return-address", aarch64_attr_enum, false, NULL,
12487 OPT_msign_return_address_ },
12488 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12489 };
12490
12491 /* Parse ARG_STR which contains the definition of one target attribute.
12492 Show appropriate errors if any or return true if the attribute is valid. */
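/* For example, an ARG_STR of "no-strict-align" matches the "strict-align"
   entry (which allows the negated form) and clears the corresponding bit
   in target_flags, while "arch=armv8.2-a" dispatches to the custom handler
   aarch64_handle_attr_arch; these values are only illustrations.  */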
12493
12494 static bool
12495 aarch64_process_one_target_attr (char *arg_str)
12496 {
12497 bool invert = false;
12498
12499 size_t len = strlen (arg_str);
12500
12501 if (len == 0)
12502 {
12503 error ("malformed %<target()%> pragma or attribute");
12504 return false;
12505 }
12506
12507 char *str_to_check = (char *) alloca (len + 1);
12508 strcpy (str_to_check, arg_str);
12509
12510 /* Skip leading whitespace. */
12511 while (*str_to_check == ' ' || *str_to_check == '\t')
12512 str_to_check++;
12513
12514 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12515 It is easier to detect and handle it explicitly here rather than going
12516 through the machinery for the rest of the target attributes in this
12517 function. */
12518 if (*str_to_check == '+')
12519 return aarch64_handle_attr_isa_flags (str_to_check);
12520
12521 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12522 {
12523 invert = true;
12524 str_to_check += 3;
12525 }
12526 char *arg = strchr (str_to_check, '=');
12527
12528 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12529 and point ARG to "foo". */
12530 if (arg)
12531 {
12532 *arg = '\0';
12533 arg++;
12534 }
12535 const struct aarch64_attribute_info *p_attr;
12536 bool found = false;
12537 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12538 {
12539 /* If the names don't match up, or the user has given an argument
12540 to an attribute that doesn't accept one, or didn't give an argument
12541 to an attribute that expects one, fail to match. */
12542 if (strcmp (str_to_check, p_attr->name) != 0)
12543 continue;
12544
12545 found = true;
12546 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12547 || p_attr->attr_type == aarch64_attr_enum;
12548
12549 if (attr_need_arg_p ^ (arg != NULL))
12550 {
12551 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12552 return false;
12553 }
12554
12555 /* If the name matches but the attribute does not allow "no-" versions
12556 then we can't match. */
12557 if (invert && !p_attr->allow_neg)
12558 {
12559 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12560 return false;
12561 }
12562
12563 switch (p_attr->attr_type)
12564 {
12565 /* Has a custom handler registered.
12566 For example, cpu=, arch=, tune=. */
12567 case aarch64_attr_custom:
12568 gcc_assert (p_attr->handler);
12569 if (!p_attr->handler (arg))
12570 return false;
12571 break;
12572
12573 /* Either set or unset a boolean option. */
12574 case aarch64_attr_bool:
12575 {
12576 struct cl_decoded_option decoded;
12577
12578 generate_option (p_attr->opt_num, NULL, !invert,
12579 CL_TARGET, &decoded);
12580 aarch64_handle_option (&global_options, &global_options_set,
12581 &decoded, input_location);
12582 break;
12583 }
12584 /* Set or unset a bit in the target_flags. aarch64_handle_option
12585 should know what mask to apply given the option number. */
12586 case aarch64_attr_mask:
12587 {
12588 struct cl_decoded_option decoded;
12589 /* We only need to specify the option number.
12590 aarch64_handle_option will know which mask to apply. */
12591 decoded.opt_index = p_attr->opt_num;
12592 decoded.value = !invert;
12593 aarch64_handle_option (&global_options, &global_options_set,
12594 &decoded, input_location);
12595 break;
12596 }
12597 /* Use the option setting machinery to set an option to an enum. */
12598 case aarch64_attr_enum:
12599 {
12600 gcc_assert (arg);
12601 bool valid;
12602 int value;
12603 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12604 &value, CL_TARGET);
12605 if (valid)
12606 {
12607 set_option (&global_options, NULL, p_attr->opt_num, value,
12608 NULL, DK_UNSPECIFIED, input_location,
12609 global_dc);
12610 }
12611 else
12612 {
12613 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12614 }
12615 break;
12616 }
12617 default:
12618 gcc_unreachable ();
12619 }
12620 }
12621
12622 /* If we reached here we either have found an attribute and validated
12623 it or didn't match any. If we matched an attribute but its arguments
12624 were malformed we will have returned false already. */
12625 return found;
12626 }
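
/* A sketch of the control flow above for two representative inputs (the
   attribute names come from aarch64_attributes):

     "no-fix-cortex-a53-835769"  ->  INVERT is set by the leading "no-", the
                                     rest matches an aarch64_attr_bool entry
                                     that allows negation, and generate_option
                                     is called with value !invert == 0.
     "cpu=cortex-a57"            ->  ARG points at "cortex-a57" and is passed
                                     to the custom handler
                                     aarch64_handle_attr_cpu.  */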
12627
12628 /* Count how many times the character C appears in
12629 NULL-terminated string STR. */
12630
12631 static unsigned int
12632 num_occurences_in_str (char c, char *str)
12633 {
12634 unsigned int res = 0;
12635 while (*str != '\0')
12636 {
12637 if (*str == c)
12638 res++;
12639
12640 str++;
12641 }
12642
12643 return res;
12644 }
12645
12646 /* Parse the tree in ARGS that contains the target attribute information
12647 and update the global target options space. */
12648
12649 bool
12650 aarch64_process_target_attr (tree args)
12651 {
12652 if (TREE_CODE (args) == TREE_LIST)
12653 {
12654 do
12655 {
12656 tree head = TREE_VALUE (args);
12657 if (head)
12658 {
12659 if (!aarch64_process_target_attr (head))
12660 return false;
12661 }
12662 args = TREE_CHAIN (args);
12663 } while (args);
12664
12665 return true;
12666 }
12667
12668 if (TREE_CODE (args) != STRING_CST)
12669 {
12670 error ("attribute %<target%> argument not a string");
12671 return false;
12672 }
12673
12674 size_t len = strlen (TREE_STRING_POINTER (args));
12675 char *str_to_check = (char *) alloca (len + 1);
12676 strcpy (str_to_check, TREE_STRING_POINTER (args));
12677
12678 if (len == 0)
12679 {
12680 error ("malformed %<target()%> pragma or attribute");
12681 return false;
12682 }
12683
12684 /* Used to catch empty strings between commas, e.g.
12685 attribute ((target ("attr1,,attr2"))). */
12686 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12687
12688 /* Handle multiple target attributes separated by ','. */
12689 char *token = strtok_r (str_to_check, ",", &str_to_check);
12690
12691 unsigned int num_attrs = 0;
12692 while (token)
12693 {
12694 num_attrs++;
12695 if (!aarch64_process_one_target_attr (token))
12696 {
12697 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12698 return false;
12699 }
12700
12701 token = strtok_r (NULL, ",", &str_to_check);
12702 }
12703
12704 if (num_attrs != num_commas + 1)
12705 {
12706 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12707 return false;
12708 }
12709
12710 return true;
12711 }
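
/* For instance (an illustrative combination, not a recommendation):

     __attribute__ ((target ("tune=cortex-a72,no-omit-leaf-frame-pointer")))

   is split on ',' into two tokens, each processed by
   aarch64_process_one_target_attr, whereas "attr1,,attr2" yields only two
   tokens for two commas and so fails the num_attrs != num_commas + 1 check
   above.  */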
12712
12713 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12714 process attribute ((target ("..."))). */
12715
12716 static bool
12717 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12718 {
12719 struct cl_target_option cur_target;
12720 bool ret;
12721 tree old_optimize;
12722 tree new_target, new_optimize;
12723 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12724
12725 /* If what we're processing is the current pragma string then the
12726 target option node is already stored in target_option_current_node
12727 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12728 having to re-parse the string. This is especially useful to keep
12729 arm_neon.h compile times down since that header contains a lot
12730 of intrinsics enclosed in pragmas. */
12731 if (!existing_target && args == current_target_pragma)
12732 {
12733 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12734 return true;
12735 }
12736 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12737
12738 old_optimize = build_optimization_node (&global_options);
12739 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12740
12741 /* If the function changed the optimization levels as well as setting
12742 target options, start with the optimizations specified. */
12743 if (func_optimize && func_optimize != old_optimize)
12744 cl_optimization_restore (&global_options,
12745 TREE_OPTIMIZATION (func_optimize));
12746
12747 /* Save the current target options to restore at the end. */
12748 cl_target_option_save (&cur_target, &global_options);
12749
12750 /* If fndecl already has some target attributes applied to it, unpack
12751 them so that we add this attribute on top of them, rather than
12752 overwriting them. */
12753 if (existing_target)
12754 {
12755 struct cl_target_option *existing_options
12756 = TREE_TARGET_OPTION (existing_target);
12757
12758 if (existing_options)
12759 cl_target_option_restore (&global_options, existing_options);
12760 }
12761 else
12762 cl_target_option_restore (&global_options,
12763 TREE_TARGET_OPTION (target_option_current_node));
12764
12765 ret = aarch64_process_target_attr (args);
12766
12767 /* Set up any additional state. */
12768 if (ret)
12769 {
12770 aarch64_override_options_internal (&global_options);
12771 /* Initialize SIMD builtins if we haven't already.
12772 Set current_target_pragma to NULL for the duration so that
12773 the builtin initialization code doesn't try to tag the functions
12774 being built with the attributes specified by any current pragma, thus
12775 going into an infinite recursion. */
12776 if (TARGET_SIMD)
12777 {
12778 tree saved_current_target_pragma = current_target_pragma;
12779 current_target_pragma = NULL;
12780 aarch64_init_simd_builtins ();
12781 current_target_pragma = saved_current_target_pragma;
12782 }
12783 new_target = build_target_option_node (&global_options);
12784 }
12785 else
12786 new_target = NULL;
12787
12788 new_optimize = build_optimization_node (&global_options);
12789
12790 if (fndecl && ret)
12791 {
12792 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12793
12794 if (old_optimize != new_optimize)
12795 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12796 }
12797
12798 cl_target_option_restore (&global_options, &cur_target);
12799
12800 if (old_optimize != new_optimize)
12801 cl_optimization_restore (&global_options,
12802 TREE_OPTIMIZATION (old_optimize));
12803 return ret;
12804 }
12805
12806 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE
12807 hold the values of tri-bool options (yes, no, don't care) and the default
12808 value is DEF, determine whether to reject inlining. */
12809
12810 static bool
12811 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12812 int dont_care, int def)
12813 {
12814 /* If the callee doesn't care, always allow inlining. */
12815 if (callee == dont_care)
12816 return true;
12817
12818 /* If the caller doesn't care, always allow inlining. */
12819 if (caller == dont_care)
12820 return true;
12821
12822 /* Otherwise, allow inlining if either the callee and caller values
12823 agree, or if the callee is using the default value. */
12824 return (callee == caller || callee == def);
12825 }
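
/* A worked example of the rule above, using the encoding of the callers
   below (2 == "don't care"):

     caller  callee  def   result
       2       0      *    inline  (caller doesn't care)
       0       2      *    inline  (callee doesn't care)
       1       1      *    inline  (both agree)
       0       1      1    inline  (callee matches the default)
       0       1      0    reject  (explicit mismatch)  */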
12826
12827 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12828 to inline CALLEE into CALLER based on target-specific info.
12829 Make sure that the caller and callee have compatible architectural
12830 features. Then go through the other possible target attributes
12831 and see if they can block inlining. Try not to reject always_inline
12832 callees unless they are incompatible architecturally. */
12833
12834 static bool
12835 aarch64_can_inline_p (tree caller, tree callee)
12836 {
12837 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12838 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12839
12840 struct cl_target_option *caller_opts
12841 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12842 : target_option_default_node);
12843
12844 struct cl_target_option *callee_opts
12845 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12846 : target_option_default_node);
12847
12848 /* Callee's ISA flags should be a subset of the caller's. */
12849 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12850 != callee_opts->x_aarch64_isa_flags)
12851 return false;
12852
12853 /* Allow non-strict aligned functions inlining into strict
12854 aligned ones. */
12855 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12856 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12857 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12858 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12859 return false;
12860
12861 bool always_inline = lookup_attribute ("always_inline",
12862 DECL_ATTRIBUTES (callee));
12863
12864 /* If the architectural features match up and the callee is always_inline
12865 then the other attributes don't matter. */
12866 if (always_inline)
12867 return true;
12868
12869 if (caller_opts->x_aarch64_cmodel_var
12870 != callee_opts->x_aarch64_cmodel_var)
12871 return false;
12872
12873 if (caller_opts->x_aarch64_tls_dialect
12874 != callee_opts->x_aarch64_tls_dialect)
12875 return false;
12876
12877 /* Honour explicit requests to workaround errata. */
12878 if (!aarch64_tribools_ok_for_inlining_p (
12879 caller_opts->x_aarch64_fix_a53_err835769,
12880 callee_opts->x_aarch64_fix_a53_err835769,
12881 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12882 return false;
12883
12884 if (!aarch64_tribools_ok_for_inlining_p (
12885 caller_opts->x_aarch64_fix_a53_err843419,
12886 callee_opts->x_aarch64_fix_a53_err843419,
12887 2, TARGET_FIX_ERR_A53_843419))
12888 return false;
12889
12890 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12891 caller and callee and they don't match up, reject inlining. */
12892 if (!aarch64_tribools_ok_for_inlining_p (
12893 caller_opts->x_flag_omit_leaf_frame_pointer,
12894 callee_opts->x_flag_omit_leaf_frame_pointer,
12895 2, 1))
12896 return false;
12897
12898 /* If the callee has specific tuning overrides, respect them. */
12899 if (callee_opts->x_aarch64_override_tune_string != NULL
12900 && caller_opts->x_aarch64_override_tune_string == NULL)
12901 return false;
12902
12903 /* If the user specified tuning override strings for the
12904 caller and callee and they don't match up, reject inlining.
12905 We just do a string compare here, we don't analyze the meaning
12906 of the string, as it would be too costly for little gain. */
12907 if (callee_opts->x_aarch64_override_tune_string
12908 && caller_opts->x_aarch64_override_tune_string
12909 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12910 caller_opts->x_aarch64_override_tune_string) != 0))
12911 return false;
12912
12913 return true;
12914 }
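
/* For example, a callee declared as

     int f (int) __attribute__ ((target ("+crc")));

   cannot be inlined into a caller that does not itself enable CRC, because
   the callee's ISA flags would not be a subset of the caller's.  (A sketch
   only; for callees that are not always_inline the remaining checks above
   also apply.)  */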
12915
12916 /* Return true if SYMBOL_REF X binds locally. */
12917
12918 static bool
12919 aarch64_symbol_binds_local_p (const_rtx x)
12920 {
12921 return (SYMBOL_REF_DECL (x)
12922 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12923 : SYMBOL_REF_LOCAL_P (x));
12924 }
12925
12926 /* Return true if SYMBOL_REF X is thread local. */
12927 static bool
12928 aarch64_tls_symbol_p (rtx x)
12929 {
12930 if (! TARGET_HAVE_TLS)
12931 return false;
12932
12933 if (GET_CODE (x) != SYMBOL_REF)
12934 return false;
12935
12936 return SYMBOL_REF_TLS_MODEL (x) != 0;
12937 }
12938
12939 /* Classify a TLS symbol into one of the TLS kinds. */
12940 enum aarch64_symbol_type
12941 aarch64_classify_tls_symbol (rtx x)
12942 {
12943 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12944
12945 switch (tls_kind)
12946 {
12947 case TLS_MODEL_GLOBAL_DYNAMIC:
12948 case TLS_MODEL_LOCAL_DYNAMIC:
12949 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12950
12951 case TLS_MODEL_INITIAL_EXEC:
12952 switch (aarch64_cmodel)
12953 {
12954 case AARCH64_CMODEL_TINY:
12955 case AARCH64_CMODEL_TINY_PIC:
12956 return SYMBOL_TINY_TLSIE;
12957 default:
12958 return SYMBOL_SMALL_TLSIE;
12959 }
12960
12961 case TLS_MODEL_LOCAL_EXEC:
12962 if (aarch64_tls_size == 12)
12963 return SYMBOL_TLSLE12;
12964 else if (aarch64_tls_size == 24)
12965 return SYMBOL_TLSLE24;
12966 else if (aarch64_tls_size == 32)
12967 return SYMBOL_TLSLE32;
12968 else if (aarch64_tls_size == 48)
12969 return SYMBOL_TLSLE48;
12970 else
12971 gcc_unreachable ();
12972
12973 case TLS_MODEL_EMULATED:
12974 case TLS_MODEL_NONE:
12975 return SYMBOL_FORCE_TO_MEM;
12976
12977 default:
12978 gcc_unreachable ();
12979 }
12980 }
12981
12982 /* Return the correct method for accessing X + OFFSET, where X is either
12983 a SYMBOL_REF or LABEL_REF. */
12984
12985 enum aarch64_symbol_type
12986 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12987 {
12988 if (GET_CODE (x) == LABEL_REF)
12989 {
12990 switch (aarch64_cmodel)
12991 {
12992 case AARCH64_CMODEL_LARGE:
12993 return SYMBOL_FORCE_TO_MEM;
12994
12995 case AARCH64_CMODEL_TINY_PIC:
12996 case AARCH64_CMODEL_TINY:
12997 return SYMBOL_TINY_ABSOLUTE;
12998
12999 case AARCH64_CMODEL_SMALL_SPIC:
13000 case AARCH64_CMODEL_SMALL_PIC:
13001 case AARCH64_CMODEL_SMALL:
13002 return SYMBOL_SMALL_ABSOLUTE;
13003
13004 default:
13005 gcc_unreachable ();
13006 }
13007 }
13008
13009 if (GET_CODE (x) == SYMBOL_REF)
13010 {
13011 if (aarch64_tls_symbol_p (x))
13012 return aarch64_classify_tls_symbol (x);
13013
13014 switch (aarch64_cmodel)
13015 {
13016 case AARCH64_CMODEL_TINY:
13017 /* When we retrieve symbol + offset address, we have to make sure
13018 the offset does not cause overflow of the final address. But
13019 we have no way of knowing the address of symbol at compile time
13020 so we can't accurately say if the distance between the PC and
13021 symbol + offset is outside the addressable range of +/-1M in the
13022 TINY code model. So we rely on images not being greater than
13023 1M and cap the offset at 1M and anything beyond 1M will have to
13024 be loaded using an alternative mechanism. Furthermore if the
13025 symbol is a weak reference to something that isn't known to
13026 resolve to a symbol in this module, then force to memory. */
13027 if ((SYMBOL_REF_WEAK (x)
13028 && !aarch64_symbol_binds_local_p (x))
13029 || !IN_RANGE (offset, -1048575, 1048575))
13030 return SYMBOL_FORCE_TO_MEM;
13031 return SYMBOL_TINY_ABSOLUTE;
13032
13033 case AARCH64_CMODEL_SMALL:
13034 /* Same reasoning as the tiny code model, but the offset cap here is
13035 4G. */
13036 if ((SYMBOL_REF_WEAK (x)
13037 && !aarch64_symbol_binds_local_p (x))
13038 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13039 HOST_WIDE_INT_C (4294967264)))
13040 return SYMBOL_FORCE_TO_MEM;
13041 return SYMBOL_SMALL_ABSOLUTE;
13042
13043 case AARCH64_CMODEL_TINY_PIC:
13044 if (!aarch64_symbol_binds_local_p (x))
13045 return SYMBOL_TINY_GOT;
13046 return SYMBOL_TINY_ABSOLUTE;
13047
13048 case AARCH64_CMODEL_SMALL_SPIC:
13049 case AARCH64_CMODEL_SMALL_PIC:
13050 if (!aarch64_symbol_binds_local_p (x))
13051 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13052 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13053 return SYMBOL_SMALL_ABSOLUTE;
13054
13055 case AARCH64_CMODEL_LARGE:
13056 /* This is alright even in PIC code as the constant
13057 pool reference is always PC relative and within
13058 the same translation unit. */
13059 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13060 return SYMBOL_SMALL_ABSOLUTE;
13061 else
13062 return SYMBOL_FORCE_TO_MEM;
13063
13064 default:
13065 gcc_unreachable ();
13066 }
13067 }
13068
13069 /* By default push everything into the constant pool. */
13070 return SYMBOL_FORCE_TO_MEM;
13071 }
13072
13073 bool
13074 aarch64_constant_address_p (rtx x)
13075 {
13076 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13077 }
13078
13079 bool
13080 aarch64_legitimate_pic_operand_p (rtx x)
13081 {
13082 if (GET_CODE (x) == SYMBOL_REF
13083 || (GET_CODE (x) == CONST
13084 && GET_CODE (XEXP (x, 0)) == PLUS
13085 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13086 return false;
13087
13088 return true;
13089 }
13090
13091 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13092 that should be rematerialized rather than spilled. */
13093
13094 static bool
13095 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13096 {
13097 /* Support CSE and rematerialization of common constants. */
13098 if (CONST_INT_P (x)
13099 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13100 || GET_CODE (x) == CONST_VECTOR)
13101 return true;
13102
13103 /* Do not allow vector struct mode constants for Advanced SIMD.
13104 We could support 0 and -1 easily, but they need support in
13105 aarch64-simd.md. */
13106 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13107 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13108 return false;
13109
13110 /* Only accept variable-length vector constants if they can be
13111 handled directly.
13112
13113 ??? It would be possible to handle rematerialization of other
13114 constants via secondary reloads. */
13115 if (vec_flags & VEC_ANY_SVE)
13116 return aarch64_simd_valid_immediate (x, NULL);
13117
13118 if (GET_CODE (x) == HIGH)
13119 x = XEXP (x, 0);
13120
13121 /* Accept polynomial constants that can be calculated by using the
13122 destination of a move as the sole temporary. Constants that
13123 require a second temporary cannot be rematerialized (they can't be
13124 forced to memory and also aren't legitimate constants). */
13125 poly_int64 offset;
13126 if (poly_int_rtx_p (x, &offset))
13127 return aarch64_offset_temporaries (false, offset) <= 1;
13128
13129 /* If an offset is being added to something else, we need to allow the
13130 base to be moved into the destination register, meaning that there
13131 are no free temporaries for the offset. */
13132 x = strip_offset (x, &offset);
13133 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13134 return false;
13135
13136 /* Do not allow const (plus (anchor_symbol, const_int)). */
13137 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13138 return false;
13139
13140 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13141 so spilling them is better than rematerialization. */
13142 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13143 return true;
13144
13145 /* Label references are always constant. */
13146 if (GET_CODE (x) == LABEL_REF)
13147 return true;
13148
13149 return false;
13150 }
13151
13152 rtx
13153 aarch64_load_tp (rtx target)
13154 {
13155 if (!target
13156 || GET_MODE (target) != Pmode
13157 || !register_operand (target, Pmode))
13158 target = gen_reg_rtx (Pmode);
13159
13160 /* Can return in any reg. */
13161 emit_insn (gen_aarch64_load_tp_hard (target));
13162 return target;
13163 }
13164
13165 /* On AAPCS systems, this is the "struct __va_list". */
13166 static GTY(()) tree va_list_type;
13167
13168 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13169 Return the type to use as __builtin_va_list.
13170
13171 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13172
13173 struct __va_list
13174 {
13175 void *__stack;
13176 void *__gr_top;
13177 void *__vr_top;
13178 int __gr_offs;
13179 int __vr_offs;
13180 }; */
13181
13182 static tree
13183 aarch64_build_builtin_va_list (void)
13184 {
13185 tree va_list_name;
13186 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13187
13188 /* Create the type. */
13189 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13190 /* Give it the required name. */
13191 va_list_name = build_decl (BUILTINS_LOCATION,
13192 TYPE_DECL,
13193 get_identifier ("__va_list"),
13194 va_list_type);
13195 DECL_ARTIFICIAL (va_list_name) = 1;
13196 TYPE_NAME (va_list_type) = va_list_name;
13197 TYPE_STUB_DECL (va_list_type) = va_list_name;
13198
13199 /* Create the fields. */
13200 f_stack = build_decl (BUILTINS_LOCATION,
13201 FIELD_DECL, get_identifier ("__stack"),
13202 ptr_type_node);
13203 f_grtop = build_decl (BUILTINS_LOCATION,
13204 FIELD_DECL, get_identifier ("__gr_top"),
13205 ptr_type_node);
13206 f_vrtop = build_decl (BUILTINS_LOCATION,
13207 FIELD_DECL, get_identifier ("__vr_top"),
13208 ptr_type_node);
13209 f_groff = build_decl (BUILTINS_LOCATION,
13210 FIELD_DECL, get_identifier ("__gr_offs"),
13211 integer_type_node);
13212 f_vroff = build_decl (BUILTINS_LOCATION,
13213 FIELD_DECL, get_identifier ("__vr_offs"),
13214 integer_type_node);
13215
13216 /* Tell tree-stdarg pass about our internal offset fields.
13217 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13218 purposes, to identify whether the code is updating va_list internal
13219 offset fields in an irregular way. */
13220 va_list_gpr_counter_field = f_groff;
13221 va_list_fpr_counter_field = f_vroff;
13222
13223 DECL_ARTIFICIAL (f_stack) = 1;
13224 DECL_ARTIFICIAL (f_grtop) = 1;
13225 DECL_ARTIFICIAL (f_vrtop) = 1;
13226 DECL_ARTIFICIAL (f_groff) = 1;
13227 DECL_ARTIFICIAL (f_vroff) = 1;
13228
13229 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13230 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13231 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13232 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13233 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13234
13235 TYPE_FIELDS (va_list_type) = f_stack;
13236 DECL_CHAIN (f_stack) = f_grtop;
13237 DECL_CHAIN (f_grtop) = f_vrtop;
13238 DECL_CHAIN (f_vrtop) = f_groff;
13239 DECL_CHAIN (f_groff) = f_vroff;
13240
13241 /* Compute its layout. */
13242 layout_type (va_list_type);
13243
13244 return va_list_type;
13245 }
13246
13247 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13248 static void
13249 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13250 {
13251 const CUMULATIVE_ARGS *cum;
13252 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13253 tree stack, grtop, vrtop, groff, vroff;
13254 tree t;
13255 int gr_save_area_size = cfun->va_list_gpr_size;
13256 int vr_save_area_size = cfun->va_list_fpr_size;
13257 int vr_offset;
13258
13259 cum = &crtl->args.info;
13260 if (cfun->va_list_gpr_size)
13261 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13262 cfun->va_list_gpr_size);
13263 if (cfun->va_list_fpr_size)
13264 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13265 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13266
13267 if (!TARGET_FLOAT)
13268 {
13269 gcc_assert (cum->aapcs_nvrn == 0);
13270 vr_save_area_size = 0;
13271 }
13272
13273 f_stack = TYPE_FIELDS (va_list_type_node);
13274 f_grtop = DECL_CHAIN (f_stack);
13275 f_vrtop = DECL_CHAIN (f_grtop);
13276 f_groff = DECL_CHAIN (f_vrtop);
13277 f_vroff = DECL_CHAIN (f_groff);
13278
13279 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13280 NULL_TREE);
13281 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13282 NULL_TREE);
13283 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13284 NULL_TREE);
13285 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13286 NULL_TREE);
13287 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13288 NULL_TREE);
13289
13290 /* Emit code to initialize STACK, which points to the next varargs stack
13291 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13292 by named arguments. STACK is 8-byte aligned. */
13293 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13294 if (cum->aapcs_stack_size > 0)
13295 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13296 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13297 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13298
13299 /* Emit code to initialize GRTOP, the top of the GR save area.
13300 virtual_incoming_args_rtx should have been 16 byte aligned. */
13301 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13302 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13303 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13304
13305 /* Emit code to initialize VRTOP, the top of the VR save area.
13306 This address is gr_save_area_bytes below GRTOP, rounded
13307 down to the next 16-byte boundary. */
13308 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13309 vr_offset = ROUND_UP (gr_save_area_size,
13310 STACK_BOUNDARY / BITS_PER_UNIT);
13311
13312 if (vr_offset)
13313 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13314 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13315 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13316
13317 /* Emit code to initialize GROFF, the offset from GRTOP of the
13318 next GPR argument. */
13319 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13320 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13321 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13322
13323 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13324 of the next VR argument. */
13325 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13326 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13327 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13328 }
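
/* A rough sketch of what the code above emits for va_start (ap, n) in
   void f (int n, ...), assuming the tree-stdarg pass did not shrink the
   save areas (the concrete numbers are examples only):

     ap.__stack   = <address of the first stack-passed vararg>;
     ap.__gr_top  = <top of the GR save area>;
     ap.__vr_top  = ap.__gr_top - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;     e.g. -56 when x1-x7 are saved
     ap.__vr_offs = -vr_save_area_size;     e.g. -128 when q0-q7 are saved  */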
13329
13330 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13331
13332 static tree
13333 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13334 gimple_seq *post_p ATTRIBUTE_UNUSED)
13335 {
13336 tree addr;
13337 bool indirect_p;
13338 bool is_ha; /* is HFA or HVA. */
13339 bool dw_align; /* double-word align. */
13340 machine_mode ag_mode = VOIDmode;
13341 int nregs;
13342 machine_mode mode;
13343
13344 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13345 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13346 HOST_WIDE_INT size, rsize, adjust, align;
13347 tree t, u, cond1, cond2;
13348
13349 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13350 if (indirect_p)
13351 type = build_pointer_type (type);
13352
13353 mode = TYPE_MODE (type);
13354
13355 f_stack = TYPE_FIELDS (va_list_type_node);
13356 f_grtop = DECL_CHAIN (f_stack);
13357 f_vrtop = DECL_CHAIN (f_grtop);
13358 f_groff = DECL_CHAIN (f_vrtop);
13359 f_vroff = DECL_CHAIN (f_groff);
13360
13361 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13362 f_stack, NULL_TREE);
13363 size = int_size_in_bytes (type);
13364
13365 bool abi_break;
13366 align
13367 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13368
13369 dw_align = false;
13370 adjust = 0;
13371 if (aarch64_vfp_is_call_or_return_candidate (mode,
13372 type,
13373 &ag_mode,
13374 &nregs,
13375 &is_ha))
13376 {
13377 /* No frontends can create types with variable-sized modes, so we
13378 shouldn't be asked to pass or return them. */
13379 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13380
13381 /* TYPE passed in fp/simd registers. */
13382 if (!TARGET_FLOAT)
13383 aarch64_err_no_fpadvsimd (mode);
13384
13385 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13386 unshare_expr (valist), f_vrtop, NULL_TREE);
13387 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13388 unshare_expr (valist), f_vroff, NULL_TREE);
13389
13390 rsize = nregs * UNITS_PER_VREG;
13391
13392 if (is_ha)
13393 {
13394 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13395 adjust = UNITS_PER_VREG - ag_size;
13396 }
13397 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13398 && size < UNITS_PER_VREG)
13399 {
13400 adjust = UNITS_PER_VREG - size;
13401 }
13402 }
13403 else
13404 {
13405 /* TYPE passed in general registers. */
13406 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13407 unshare_expr (valist), f_grtop, NULL_TREE);
13408 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13409 unshare_expr (valist), f_groff, NULL_TREE);
13410 rsize = ROUND_UP (size, UNITS_PER_WORD);
13411 nregs = rsize / UNITS_PER_WORD;
13412
13413 if (align > 8)
13414 {
13415 if (abi_break && warn_psabi)
13416 inform (input_location, "parameter passing for argument of type "
13417 "%qT changed in GCC 9.1", type);
13418 dw_align = true;
13419 }
13420
13421 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13422 && size < UNITS_PER_WORD)
13423 {
13424 adjust = UNITS_PER_WORD - size;
13425 }
13426 }
13427
13428 /* Get a local temporary for the field value. */
13429 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13430
13431 /* Emit code to branch if off >= 0. */
13432 t = build2 (GE_EXPR, boolean_type_node, off,
13433 build_int_cst (TREE_TYPE (off), 0));
13434 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13435
13436 if (dw_align)
13437 {
13438 /* Emit: offs = (offs + 15) & -16. */
13439 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13440 build_int_cst (TREE_TYPE (off), 15));
13441 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13442 build_int_cst (TREE_TYPE (off), -16));
13443 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13444 }
13445 else
13446 roundup = NULL;
13447
13448 /* Update ap.__[g|v]r_offs */
13449 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13450 build_int_cst (TREE_TYPE (off), rsize));
13451 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13452
13453 /* String up. */
13454 if (roundup)
13455 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13456
13457 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13458 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13459 build_int_cst (TREE_TYPE (f_off), 0));
13460 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13461
13462 /* String up: make sure the assignment happens before the use. */
13463 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13464 COND_EXPR_ELSE (cond1) = t;
13465
13466 /* Prepare the trees handling the argument that is passed on the stack;
13467 the top level node will store in ON_STACK. */
13468 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13469 if (align > 8)
13470 {
13471 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13472 t = fold_build_pointer_plus_hwi (arg, 15);
13473 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13474 build_int_cst (TREE_TYPE (t), -16));
13475 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13476 }
13477 else
13478 roundup = NULL;
13479 /* Advance ap.__stack */
13480 t = fold_build_pointer_plus_hwi (arg, size + 7);
13481 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13482 build_int_cst (TREE_TYPE (t), -8));
13483 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13484 /* String up roundup and advance. */
13485 if (roundup)
13486 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13487 /* String up with arg */
13488 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13489 /* Big-endianness related address adjustment. */
13490 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13491 && size < UNITS_PER_WORD)
13492 {
13493 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13494 size_int (UNITS_PER_WORD - size));
13495 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13496 }
13497
13498 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13499 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13500
13501 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13502 t = off;
13503 if (adjust)
13504 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13505 build_int_cst (TREE_TYPE (off), adjust));
13506
13507 t = fold_convert (sizetype, t);
13508 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13509
13510 if (is_ha)
13511 {
13512 /* type ha; // treat as "struct {ftype field[n];}"
13513 ... [computing offs]
13514 for (i = 0; i < nregs; ++i, offs += 16)
13515 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13516 return ha; */
13517 int i;
13518 tree tmp_ha, field_t, field_ptr_t;
13519
13520 /* Declare a local variable. */
13521 tmp_ha = create_tmp_var_raw (type, "ha");
13522 gimple_add_tmp_var (tmp_ha);
13523
13524 /* Establish the base type. */
13525 switch (ag_mode)
13526 {
13527 case E_SFmode:
13528 field_t = float_type_node;
13529 field_ptr_t = float_ptr_type_node;
13530 break;
13531 case E_DFmode:
13532 field_t = double_type_node;
13533 field_ptr_t = double_ptr_type_node;
13534 break;
13535 case E_TFmode:
13536 field_t = long_double_type_node;
13537 field_ptr_t = long_double_ptr_type_node;
13538 break;
13539 case E_HFmode:
13540 field_t = aarch64_fp16_type_node;
13541 field_ptr_t = aarch64_fp16_ptr_type_node;
13542 break;
13543 case E_V2SImode:
13544 case E_V4SImode:
13545 {
13546 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13547 field_t = build_vector_type_for_mode (innertype, ag_mode);
13548 field_ptr_t = build_pointer_type (field_t);
13549 }
13550 break;
13551 default:
13552 gcc_assert (0);
13553 }
13554
13555 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
13556 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13557 addr = t;
13558 t = fold_convert (field_ptr_t, addr);
13559 t = build2 (MODIFY_EXPR, field_t,
13560 build1 (INDIRECT_REF, field_t, tmp_ha),
13561 build1 (INDIRECT_REF, field_t, t));
13562
13563 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13564 for (i = 1; i < nregs; ++i)
13565 {
13566 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13567 u = fold_convert (field_ptr_t, addr);
13568 u = build2 (MODIFY_EXPR, field_t,
13569 build2 (MEM_REF, field_t, tmp_ha,
13570 build_int_cst (field_ptr_t,
13571 (i *
13572 int_size_in_bytes (field_t)))),
13573 build1 (INDIRECT_REF, field_t, u));
13574 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13575 }
13576
13577 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13578 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13579 }
13580
13581 COND_EXPR_ELSE (cond2) = t;
13582 addr = fold_convert (build_pointer_type (type), cond1);
13583 addr = build_va_arg_indirect_ref (addr);
13584
13585 if (indirect_p)
13586 addr = build_va_arg_indirect_ref (addr);
13587
13588 return addr;
13589 }
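
/* A simplified sketch of the expansion built above, shown for a plain "int"
   argument taken from the general-register save area (pseudo C, ignoring
   big-endian adjustments):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + 8;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;                   register save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (ap.__stack + size + 7) & -8;
   done:
     result = *(int *) addr;  */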
13590
13591 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13592
13593 static void
13594 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13595 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13596 int no_rtl)
13597 {
13598 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13599 CUMULATIVE_ARGS local_cum;
13600 int gr_saved = cfun->va_list_gpr_size;
13601 int vr_saved = cfun->va_list_fpr_size;
13602
13603 /* The caller has advanced CUM up to, but not beyond, the last named
13604 argument. Advance a local copy of CUM past the last "real" named
13605 argument, to find out how many registers are left over. */
13606 local_cum = *cum;
13607 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13608
13609 /* Find out how many registers we need to save.
13610 Honor the tree-stdarg analysis results. */
13611 if (cfun->va_list_gpr_size)
13612 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13613 cfun->va_list_gpr_size / UNITS_PER_WORD);
13614 if (cfun->va_list_fpr_size)
13615 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13616 cfun->va_list_fpr_size / UNITS_PER_VREG);
13617
13618 if (!TARGET_FLOAT)
13619 {
13620 gcc_assert (local_cum.aapcs_nvrn == 0);
13621 vr_saved = 0;
13622 }
13623
13624 if (!no_rtl)
13625 {
13626 if (gr_saved > 0)
13627 {
13628 rtx ptr, mem;
13629
13630 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13631 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13632 - gr_saved * UNITS_PER_WORD);
13633 mem = gen_frame_mem (BLKmode, ptr);
13634 set_mem_alias_set (mem, get_varargs_alias_set ());
13635
13636 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13637 mem, gr_saved);
13638 }
13639 if (vr_saved > 0)
13640 {
13641 /* We can't use move_block_from_reg, because it will use
13642 the wrong mode, storing D regs only. */
13643 machine_mode mode = TImode;
13644 int off, i, vr_start;
13645
13646 /* Set OFF to the offset from virtual_incoming_args_rtx of
13647 the first vector register. The VR save area lies below
13648 the GR one, and is aligned to 16 bytes. */
13649 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13650 STACK_BOUNDARY / BITS_PER_UNIT);
13651 off -= vr_saved * UNITS_PER_VREG;
13652
13653 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13654 for (i = 0; i < vr_saved; ++i)
13655 {
13656 rtx ptr, mem;
13657
13658 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13659 mem = gen_frame_mem (mode, ptr);
13660 set_mem_alias_set (mem, get_varargs_alias_set ());
13661 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13662 off += UNITS_PER_VREG;
13663 }
13664 }
13665 }
13666
13667 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13668 any complication of having crtl->args.pretend_args_size changed. */
13669 cfun->machine->frame.saved_varargs_size
13670 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13671 STACK_BOUNDARY / BITS_PER_UNIT)
13672 + vr_saved * UNITS_PER_VREG);
13673 }
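
/* A sketch of the resulting save-area layout just below
   virtual_incoming_args_rtx (higher addresses at the top), assuming
   gr_saved == 7 and vr_saved == 8:

       incoming stack arguments
     ---- virtual_incoming_args_rtx ----
       x1 ... x7   (56 bytes, padded up to 64)
     -----------------------------------
       q0 ... q7   (128 bytes)
     -----------------------------------

   saved_varargs_size is then ROUND_UP (56, 16) + 128 == 192.  */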
13674
13675 static void
13676 aarch64_conditional_register_usage (void)
13677 {
13678 int i;
13679 if (!TARGET_FLOAT)
13680 {
13681 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13682 {
13683 fixed_regs[i] = 1;
13684 call_used_regs[i] = 1;
13685 }
13686 }
13687 if (!TARGET_SVE)
13688 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13689 {
13690 fixed_regs[i] = 1;
13691 call_used_regs[i] = 1;
13692 }
13693
13694 /* When tracking speculation, we need a couple of call-clobbered registers
13695 to track the speculation state. It would be nice to just use
13696 IP0 and IP1, but currently there are numerous places that just
13697 assume these registers are free for other uses (eg pointer
13698 authentication). */
13699 if (aarch64_track_speculation)
13700 {
13701 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13702 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13703 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13704 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13705 }
13706 }
13707
13708 /* Walk down the type tree of TYPE counting consecutive base elements.
13709 If *MODEP is VOIDmode, then set it to the first valid floating point
13710 type. If a non-floating point type is found, or if a floating point
13711 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13712 otherwise return the count in the sub-tree. */
13713 static int
13714 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13715 {
13716 machine_mode mode;
13717 HOST_WIDE_INT size;
13718
13719 switch (TREE_CODE (type))
13720 {
13721 case REAL_TYPE:
13722 mode = TYPE_MODE (type);
13723 if (mode != DFmode && mode != SFmode
13724 && mode != TFmode && mode != HFmode)
13725 return -1;
13726
13727 if (*modep == VOIDmode)
13728 *modep = mode;
13729
13730 if (*modep == mode)
13731 return 1;
13732
13733 break;
13734
13735 case COMPLEX_TYPE:
13736 mode = TYPE_MODE (TREE_TYPE (type));
13737 if (mode != DFmode && mode != SFmode
13738 && mode != TFmode && mode != HFmode)
13739 return -1;
13740
13741 if (*modep == VOIDmode)
13742 *modep = mode;
13743
13744 if (*modep == mode)
13745 return 2;
13746
13747 break;
13748
13749 case VECTOR_TYPE:
13750 /* Use V2SImode and V4SImode as representatives of all 64-bit
13751 and 128-bit vector types. */
13752 size = int_size_in_bytes (type);
13753 switch (size)
13754 {
13755 case 8:
13756 mode = V2SImode;
13757 break;
13758 case 16:
13759 mode = V4SImode;
13760 break;
13761 default:
13762 return -1;
13763 }
13764
13765 if (*modep == VOIDmode)
13766 *modep = mode;
13767
13768 /* Vector modes are considered to be opaque: two vectors are
13769 equivalent for the purposes of being homogeneous aggregates
13770 if they are the same size. */
13771 if (*modep == mode)
13772 return 1;
13773
13774 break;
13775
13776 case ARRAY_TYPE:
13777 {
13778 int count;
13779 tree index = TYPE_DOMAIN (type);
13780
13781 /* Can't handle incomplete types nor sizes that are not
13782 fixed. */
13783 if (!COMPLETE_TYPE_P (type)
13784 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13785 return -1;
13786
13787 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13788 if (count == -1
13789 || !index
13790 || !TYPE_MAX_VALUE (index)
13791 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13792 || !TYPE_MIN_VALUE (index)
13793 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13794 || count < 0)
13795 return -1;
13796
13797 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13798 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13799
13800 /* There must be no padding. */
13801 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13802 count * GET_MODE_BITSIZE (*modep)))
13803 return -1;
13804
13805 return count;
13806 }
13807
13808 case RECORD_TYPE:
13809 {
13810 int count = 0;
13811 int sub_count;
13812 tree field;
13813
13814 /* Can't handle incomplete types nor sizes that are not
13815 fixed. */
13816 if (!COMPLETE_TYPE_P (type)
13817 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13818 return -1;
13819
13820 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13821 {
13822 if (TREE_CODE (field) != FIELD_DECL)
13823 continue;
13824
13825 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13826 if (sub_count < 0)
13827 return -1;
13828 count += sub_count;
13829 }
13830
13831 /* There must be no padding. */
13832 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13833 count * GET_MODE_BITSIZE (*modep)))
13834 return -1;
13835
13836 return count;
13837 }
13838
13839 case UNION_TYPE:
13840 case QUAL_UNION_TYPE:
13841 {
13842 /* These aren't very interesting except in a degenerate case. */
13843 int count = 0;
13844 int sub_count;
13845 tree field;
13846
13847 /* Can't handle incomplete types nor sizes that are not
13848 fixed. */
13849 if (!COMPLETE_TYPE_P (type)
13850 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13851 return -1;
13852
13853 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13854 {
13855 if (TREE_CODE (field) != FIELD_DECL)
13856 continue;
13857
13858 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13859 if (sub_count < 0)
13860 return -1;
13861 count = count > sub_count ? count : sub_count;
13862 }
13863
13864 /* There must be no padding. */
13865 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13866 count * GET_MODE_BITSIZE (*modep)))
13867 return -1;
13868
13869 return count;
13870 }
13871
13872 default:
13873 break;
13874 }
13875
13876 return -1;
13877 }
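
/* Some illustrative inputs and results for the walk above (assumed
   examples, not an exhaustive list):

     struct { double x, y; }          ->  2, *modep == DFmode
     struct { float v[3]; }           ->  3, *modep == SFmode
     _Complex double                  ->  2, *modep == DFmode
     struct { float x; double y; }    -> -1  (mixed element modes)
     struct { float x; int y; }       -> -1  (non-FP member)  */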
13878
13879 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13880 type as described in AAPCS64 \S 4.1.2.
13881
13882 See the comment above aarch64_composite_type_p for the notes on MODE. */
13883
13884 static bool
13885 aarch64_short_vector_p (const_tree type,
13886 machine_mode mode)
13887 {
13888 poly_int64 size = -1;
13889
13890 if (type && TREE_CODE (type) == VECTOR_TYPE)
13891 size = int_size_in_bytes (type);
13892 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13893 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13894 size = GET_MODE_SIZE (mode);
13895
13896 return known_eq (size, 8) || known_eq (size, 16);
13897 }
13898
13899 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13900 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13901 array types. The C99 floating-point complex types are also considered
13902 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13903 types, which are GCC extensions and out of the scope of AAPCS64, are
13904 treated as composite types here as well.
13905
13906 Note that MODE itself is not sufficient in determining whether a type
13907 is such a composite type or not. This is because
13908 stor-layout.c:compute_record_mode may have already changed the MODE
13909 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13910 structure with only one field may have its MODE set to the mode of the
13911 field. Also an integer mode whose size matches the size of the
13912 RECORD_TYPE type may be used to substitute the original mode
13913 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13914 solely relied on. */
13915
13916 static bool
13917 aarch64_composite_type_p (const_tree type,
13918 machine_mode mode)
13919 {
13920 if (aarch64_short_vector_p (type, mode))
13921 return false;
13922
13923 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13924 return true;
13925
13926 if (mode == BLKmode
13927 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13928 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13929 return true;
13930
13931 return false;
13932 }
13933
13934 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13935 shall be passed or returned in simd/fp register(s) (providing these
13936 parameter passing registers are available).
13937
13938 Upon successful return, *COUNT returns the number of needed registers,
13939 *BASE_MODE returns the mode of the individual register and, when IS_HA
13940 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13941 floating-point aggregate or a homogeneous short-vector aggregate. */
13942
13943 static bool
13944 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13945 const_tree type,
13946 machine_mode *base_mode,
13947 int *count,
13948 bool *is_ha)
13949 {
13950 machine_mode new_mode = VOIDmode;
13951 bool composite_p = aarch64_composite_type_p (type, mode);
13952
13953 if (is_ha != NULL) *is_ha = false;
13954
13955 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13956 || aarch64_short_vector_p (type, mode))
13957 {
13958 *count = 1;
13959 new_mode = mode;
13960 }
13961 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13962 {
13963 if (is_ha != NULL) *is_ha = true;
13964 *count = 2;
13965 new_mode = GET_MODE_INNER (mode);
13966 }
13967 else if (type && composite_p)
13968 {
13969 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13970
13971 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13972 {
13973 if (is_ha != NULL) *is_ha = true;
13974 *count = ag_count;
13975 }
13976 else
13977 return false;
13978 }
13979 else
13980 return false;
13981
13982 *base_mode = new_mode;
13983 return true;
13984 }
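
/* For instance (a hedged sketch, not an exhaustive list):

     double                      ->  true, *count = 1, *base_mode = DFmode
     _Complex float              ->  true, *count = 2, *base_mode = SFmode,
                                     *is_ha = true
     struct { float x, y, z; }   ->  true, *count = 3, *base_mode = SFmode,
                                     *is_ha = true  (an HFA)
     struct { float v[5]; }      ->  false  (exceeds HA_MAX_NUM_FLDS)
     struct { float x; int y; }  ->  false  */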
13985
13986 /* Implement TARGET_STRUCT_VALUE_RTX. */
13987
13988 static rtx
13989 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13990 int incoming ATTRIBUTE_UNUSED)
13991 {
13992 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13993 }
13994
13995 /* Implements target hook vector_mode_supported_p. */
13996 static bool
13997 aarch64_vector_mode_supported_p (machine_mode mode)
13998 {
13999 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14000 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14001 }
14002
14003 /* Return appropriate SIMD container
14004 for MODE within a vector of WIDTH bits. */
14005 static machine_mode
14006 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14007 {
14008 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14009 switch (mode)
14010 {
14011 case E_DFmode:
14012 return VNx2DFmode;
14013 case E_SFmode:
14014 return VNx4SFmode;
14015 case E_HFmode:
14016 return VNx8HFmode;
14017 case E_DImode:
14018 return VNx2DImode;
14019 case E_SImode:
14020 return VNx4SImode;
14021 case E_HImode:
14022 return VNx8HImode;
14023 case E_QImode:
14024 return VNx16QImode;
14025 default:
14026 return word_mode;
14027 }
14028
14029 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14030 if (TARGET_SIMD)
14031 {
14032 if (known_eq (width, 128))
14033 switch (mode)
14034 {
14035 case E_DFmode:
14036 return V2DFmode;
14037 case E_SFmode:
14038 return V4SFmode;
14039 case E_HFmode:
14040 return V8HFmode;
14041 case E_SImode:
14042 return V4SImode;
14043 case E_HImode:
14044 return V8HImode;
14045 case E_QImode:
14046 return V16QImode;
14047 case E_DImode:
14048 return V2DImode;
14049 default:
14050 break;
14051 }
14052 else
14053 switch (mode)
14054 {
14055 case E_SFmode:
14056 return V2SFmode;
14057 case E_HFmode:
14058 return V4HFmode;
14059 case E_SImode:
14060 return V2SImode;
14061 case E_HImode:
14062 return V4HImode;
14063 case E_QImode:
14064 return V8QImode;
14065 default:
14066 break;
14067 }
14068 }
14069 return word_mode;
14070 }
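
/* Example mappings (illustrative, assuming TARGET_SIMD; the SVE row also
   assumes TARGET_SVE):

     SFmode, 128 bits             -> V4SFmode
     SFmode,  64 bits             -> V2SFmode
     DFmode,  64 bits             -> word_mode  (no 64-bit DF container)
     SFmode, BITS_PER_SVE_VECTOR  -> VNx4SFmode  */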
14071
14072 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14073 static machine_mode
14074 aarch64_preferred_simd_mode (scalar_mode mode)
14075 {
14076 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14077 return aarch64_simd_container_mode (mode, bits);
14078 }
14079
14080 /* Return a list of possible vector sizes for the vectorizer
14081 to iterate over. */
14082 static void
14083 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14084 {
14085 if (TARGET_SVE)
14086 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14087 sizes->safe_push (16);
14088 sizes->safe_push (8);
14089 }
14090
14091 /* Implement TARGET_MANGLE_TYPE. */
14092
14093 static const char *
14094 aarch64_mangle_type (const_tree type)
14095 {
14096 /* The AArch64 ABI documents say that "__va_list" has to be
14097 mangled as if it is in the "std" namespace. */
14098 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14099 return "St9__va_list";
14100
14101 /* Half-precision float. */
14102 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14103 return "Dh";
14104
14105 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14106 builtin types. */
14107 if (TYPE_NAME (type) != NULL)
14108 return aarch64_mangle_builtin_type (type);
14109
14110 /* Use the default mangling. */
14111 return NULL;
14112 }
14113
14114 /* Find the first rtx_insn before insn that will generate an assembly
14115 instruction. */
14116
14117 static rtx_insn *
14118 aarch64_prev_real_insn (rtx_insn *insn)
14119 {
14120 if (!insn)
14121 return NULL;
14122
14123 do
14124 {
14125 insn = prev_real_insn (insn);
14126 }
14127 while (insn && recog_memoized (insn) < 0);
14128
14129 return insn;
14130 }
14131
14132 static bool
14133 is_madd_op (enum attr_type t1)
14134 {
14135 unsigned int i;
14136 /* A number of these may be AArch32 only. */
14137 enum attr_type mlatypes[] = {
14138 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14139 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14140 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14141 };
14142
14143 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14144 {
14145 if (t1 == mlatypes[i])
14146 return true;
14147 }
14148
14149 return false;
14150 }
14151
14152 /* Check if there is a register dependency between a load and the insn
14153 for which we hold recog_data. */
14154
14155 static bool
14156 dep_between_memop_and_curr (rtx memop)
14157 {
14158 rtx load_reg;
14159 int opno;
14160
14161 gcc_assert (GET_CODE (memop) == SET);
14162
14163 if (!REG_P (SET_DEST (memop)))
14164 return false;
14165
14166 load_reg = SET_DEST (memop);
14167 for (opno = 1; opno < recog_data.n_operands; opno++)
14168 {
14169 rtx operand = recog_data.operand[opno];
14170 if (REG_P (operand)
14171 && reg_overlap_mentioned_p (load_reg, operand))
14172 return true;
14173
14174 }
14175 return false;
14176 }
14177
14178
14179 /* When working around the Cortex-A53 erratum 835769,
14180 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14181 instruction and has a preceding memory instruction such that a NOP
14182 should be inserted between them. */
14183
14184 bool
14185 aarch64_madd_needs_nop (rtx_insn* insn)
14186 {
14187 enum attr_type attr_type;
14188 rtx_insn *prev;
14189 rtx body;
14190
14191 if (!TARGET_FIX_ERR_A53_835769)
14192 return false;
14193
14194 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14195 return false;
14196
14197 attr_type = get_attr_type (insn);
14198 if (!is_madd_op (attr_type))
14199 return false;
14200
14201 prev = aarch64_prev_real_insn (insn);
14202 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14203 Restore recog state to INSN to avoid state corruption. */
14204 extract_constrain_insn_cached (insn);
14205
14206 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14207 return false;
14208
14209 body = single_set (prev);
14210
14211 /* If the previous insn is a memory op and there is no dependency between
14212 it and the DImode madd, emit a NOP between them. If body is NULL then we
14213 have a complex memory operation, probably a load/store pair.
14214 Be conservative for now and emit a NOP. */
14215 if (GET_MODE (recog_data.operand[0]) == DImode
14216 && (!body || !dep_between_memop_and_curr (body)))
14217 return true;
14218
14219 return false;
14220
14221 }
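
/* The resulting workaround, as emitted by aarch64_final_prescan_insn below,
   looks roughly like this in the output assembly (register names are
   arbitrary):

       ldr   x5, [x10]
       nop                     // between mem op and mult-accumulate
       madd  x0, x1, x2, x3

   i.e. a NOP separates a memory operation from a following 64-bit
   multiply-accumulate when the erratum conditions are met.  */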
14222
14223
14224 /* Implement FINAL_PRESCAN_INSN. */
14225
14226 void
14227 aarch64_final_prescan_insn (rtx_insn *insn)
14228 {
14229 if (aarch64_madd_needs_nop (insn))
14230 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14231 }
14232
14233
14234 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14235 instruction. */
14236
14237 bool
14238 aarch64_sve_index_immediate_p (rtx base_or_step)
14239 {
14240 return (CONST_INT_P (base_or_step)
14241 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14242 }
14243
14244 /* Return true if X is a valid immediate for the SVE ADD and SUB
14245 instructions. Negate X first if NEGATE_P is true. */
14246
14247 bool
14248 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14249 {
14250 rtx elt;
14251
14252 if (!const_vec_duplicate_p (x, &elt)
14253 || !CONST_INT_P (elt))
14254 return false;
14255
14256 HOST_WIDE_INT val = INTVAL (elt);
14257 if (negate_p)
14258 val = -val;
14259 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14260
14261 if (val & 0xff)
14262 return IN_RANGE (val, 0, 0xff);
14263 return IN_RANGE (val, 0, 0xff00);
14264 }
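
/* Examples of values accepted above (after any negation and masking):
   1, 0xff, 0x100 and 0xff00 are valid, being either an 8-bit immediate or
   an 8-bit immediate shifted left by 8, while 0x101 is not.  */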
14265
14266 /* Return true if X is a valid immediate operand for an SVE logical
14267 instruction such as AND. */
14268
14269 bool
14270 aarch64_sve_bitmask_immediate_p (rtx x)
14271 {
14272 rtx elt;
14273
14274 return (const_vec_duplicate_p (x, &elt)
14275 && CONST_INT_P (elt)
14276 && aarch64_bitmask_imm (INTVAL (elt),
14277 GET_MODE_INNER (GET_MODE (x))));
14278 }
14279
14280 /* Return true if X is a valid immediate for the SVE DUP and CPY
14281 instructions. */
14282
14283 bool
14284 aarch64_sve_dup_immediate_p (rtx x)
14285 {
14286 rtx elt;
14287
14288 if (!const_vec_duplicate_p (x, &elt)
14289 || !CONST_INT_P (elt))
14290 return false;
14291
14292 HOST_WIDE_INT val = INTVAL (elt);
14293 if (val & 0xff)
14294 return IN_RANGE (val, -0x80, 0x7f);
14295 return IN_RANGE (val, -0x8000, 0x7f00);
14296 }
14297
14298 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14299 SIGNED_P says whether the operand is signed rather than unsigned. */
14300
14301 bool
14302 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14303 {
14304 rtx elt;
14305
14306 return (const_vec_duplicate_p (x, &elt)
14307 && CONST_INT_P (elt)
14308 && (signed_p
14309 ? IN_RANGE (INTVAL (elt), -16, 15)
14310 : IN_RANGE (INTVAL (elt), 0, 127)));
14311 }
14312
14313 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14314 instruction. Negate X first if NEGATE_P is true. */
14315
14316 bool
14317 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14318 {
14319 rtx elt;
14320 REAL_VALUE_TYPE r;
14321
14322 if (!const_vec_duplicate_p (x, &elt)
14323 || GET_CODE (elt) != CONST_DOUBLE)
14324 return false;
14325
14326 r = *CONST_DOUBLE_REAL_VALUE (elt);
14327
14328 if (negate_p)
14329 r = real_value_negate (&r);
14330
14331 if (real_equal (&r, &dconst1))
14332 return true;
14333 if (real_equal (&r, &dconsthalf))
14334 return true;
14335 return false;
14336 }
14337
14338 /* Return true if X is a valid immediate operand for an SVE FMUL
14339 instruction. */
14340
14341 bool
14342 aarch64_sve_float_mul_immediate_p (rtx x)
14343 {
14344 rtx elt;
14345
14346 /* GCC will never generate a multiply with an immediate of 2, so there is no
14347 point testing for it (even though it is a valid constant). */
14348 return (const_vec_duplicate_p (x, &elt)
14349 && GET_CODE (elt) == CONST_DOUBLE
14350 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14351 }
14352
14353 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14354 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14355 is nonnull, use it to describe valid immediates. */
14356 static bool
14357 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14358 simd_immediate_info *info,
14359 enum simd_immediate_check which,
14360 simd_immediate_info::insn_type insn)
14361 {
14362 /* Try a 4-byte immediate with LSL. */
14363 for (unsigned int shift = 0; shift < 32; shift += 8)
14364 if ((val32 & (0xff << shift)) == val32)
14365 {
14366 if (info)
14367 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14368 simd_immediate_info::LSL, shift);
14369 return true;
14370 }
14371
14372 /* Try a 2-byte immediate with LSL. */
14373 unsigned int imm16 = val32 & 0xffff;
14374 if (imm16 == (val32 >> 16))
14375 for (unsigned int shift = 0; shift < 16; shift += 8)
14376 if ((imm16 & (0xff << shift)) == imm16)
14377 {
14378 if (info)
14379 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14380 simd_immediate_info::LSL, shift);
14381 return true;
14382 }
14383
14384 /* Try a 4-byte immediate with MSL, except for cases that MVN
14385 can handle. */
14386 if (which == AARCH64_CHECK_MOV)
14387 for (unsigned int shift = 8; shift < 24; shift += 8)
14388 {
14389 unsigned int low = (1 << shift) - 1;
14390 if (((val32 & (0xff << shift)) | low) == val32)
14391 {
14392 if (info)
14393 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14394 simd_immediate_info::MSL, shift);
14395 return true;
14396 }
14397 }
14398
14399 return false;
14400 }
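
/* Worked examples (illustrative, not from the original sources):
   val32 == 0x00ab0000 matches the 4-byte LSL case with value 0xab and
   shift 16; val32 == 0x0000abff matches the MSL case with value 0xab
   and shift 8, since ORing in the low (1 << 8) - 1 bits reproduces the
   original constant.  */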
14401
14402 /* Return true if replicating VAL64 is a valid immediate for the
14403 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14404 use it to describe valid immediates. */
14405 static bool
14406 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14407 simd_immediate_info *info,
14408 enum simd_immediate_check which)
14409 {
14410 unsigned int val32 = val64 & 0xffffffff;
14411 unsigned int val16 = val64 & 0xffff;
14412 unsigned int val8 = val64 & 0xff;
14413
14414 if (val32 == (val64 >> 32))
14415 {
14416 if ((which & AARCH64_CHECK_ORR) != 0
14417 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14418 simd_immediate_info::MOV))
14419 return true;
14420
14421 if ((which & AARCH64_CHECK_BIC) != 0
14422 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14423 simd_immediate_info::MVN))
14424 return true;
14425
14426 /* Try using a replicated byte. */
14427 if (which == AARCH64_CHECK_MOV
14428 && val16 == (val32 >> 16)
14429 && val8 == (val16 >> 8))
14430 {
14431 if (info)
14432 *info = simd_immediate_info (QImode, val8);
14433 return true;
14434 }
14435 }
14436
14437 /* Try using a bit-to-bytemask. */
14438 if (which == AARCH64_CHECK_MOV)
14439 {
14440 unsigned int i;
14441 for (i = 0; i < 64; i += 8)
14442 {
14443 unsigned char byte = (val64 >> i) & 0xff;
14444 if (byte != 0 && byte != 0xff)
14445 break;
14446 }
14447 if (i == 64)
14448 {
14449 if (info)
14450 *info = simd_immediate_info (DImode, val64);
14451 return true;
14452 }
14453 }
14454 return false;
14455 }
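
/* Illustrative example (not from the original sources): for the
   bit-to-bytemask case, val64 == 0xff00ff0000ff00ff is accepted because
   every byte is either 0x00 or 0xff, so each byte of the 64-bit
   immediate can be selected as all-ones or all-zeros.  */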
14456
14457 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14458 instruction. If INFO is nonnull, use it to describe valid immediates. */
14459
14460 static bool
14461 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14462 simd_immediate_info *info)
14463 {
14464 scalar_int_mode mode = DImode;
14465 unsigned int val32 = val64 & 0xffffffff;
14466 if (val32 == (val64 >> 32))
14467 {
14468 mode = SImode;
14469 unsigned int val16 = val32 & 0xffff;
14470 if (val16 == (val32 >> 16))
14471 {
14472 mode = HImode;
14473 unsigned int val8 = val16 & 0xff;
14474 if (val8 == (val16 >> 8))
14475 mode = QImode;
14476 }
14477 }
14478 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14479 if (IN_RANGE (val, -0x80, 0x7f))
14480 {
14481 /* DUP with no shift. */
14482 if (info)
14483 *info = simd_immediate_info (mode, val);
14484 return true;
14485 }
14486 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14487 {
14488 /* DUP with LSL #8. */
14489 if (info)
14490 *info = simd_immediate_info (mode, val);
14491 return true;
14492 }
14493 if (aarch64_bitmask_imm (val64, mode))
14494 {
14495 /* DUPM. */
14496 if (info)
14497 *info = simd_immediate_info (mode, val);
14498 return true;
14499 }
14500 return false;
14501 }
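
/* Worked examples (illustrative, not from the original sources):
   val64 == 0x1212121212121212 reduces to a QImode value of 0x12 and is
   handled by DUP with no shift; val64 == 0x1200120012001200 reduces to
   an HImode value of 0x1200, whose low byte is zero and which fits in
   -0x8000..0x7f00, so it is handled by DUP with LSL #8.  */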
14502
14503 /* Return true if OP is a valid SIMD immediate for the operation
14504 described by WHICH. If INFO is nonnull, use it to describe valid
14505 immediates. */
14506 bool
14507 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14508 enum simd_immediate_check which)
14509 {
14510 machine_mode mode = GET_MODE (op);
14511 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14512 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14513 return false;
14514
14515 scalar_mode elt_mode = GET_MODE_INNER (mode);
14516 rtx base, step;
14517 unsigned int n_elts;
14518 if (GET_CODE (op) == CONST_VECTOR
14519 && CONST_VECTOR_DUPLICATE_P (op))
14520 n_elts = CONST_VECTOR_NPATTERNS (op);
14521 else if ((vec_flags & VEC_SVE_DATA)
14522 && const_vec_series_p (op, &base, &step))
14523 {
14524 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14525 if (!aarch64_sve_index_immediate_p (base)
14526 || !aarch64_sve_index_immediate_p (step))
14527 return false;
14528
14529 if (info)
14530 *info = simd_immediate_info (elt_mode, base, step);
14531 return true;
14532 }
14533 else if (GET_CODE (op) == CONST_VECTOR
14534 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14535 /* N_ELTS set above. */;
14536 else
14537 return false;
14538
14539 /* Handle PFALSE and PTRUE. */
14540 if (vec_flags & VEC_SVE_PRED)
14541 return (op == CONST0_RTX (mode)
14542 || op == CONSTM1_RTX (mode));
14543
14544 scalar_float_mode elt_float_mode;
14545 if (n_elts == 1
14546 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14547 {
14548 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14549 if (aarch64_float_const_zero_rtx_p (elt)
14550 || aarch64_float_const_representable_p (elt))
14551 {
14552 if (info)
14553 *info = simd_immediate_info (elt_float_mode, elt);
14554 return true;
14555 }
14556 }
14557
14558 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14559 if (elt_size > 8)
14560 return false;
14561
14562 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14563
14564 /* Expand the vector constant out into a byte vector, with the least
14565 significant byte of the register first. */
14566 auto_vec<unsigned char, 16> bytes;
14567 bytes.reserve (n_elts * elt_size);
14568 for (unsigned int i = 0; i < n_elts; i++)
14569 {
14570 /* The vector is provided in gcc endian-neutral fashion.
14571 For aarch64_be Advanced SIMD, it must be laid out in the vector
14572 register in reverse order. */
14573 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14574 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14575
14576 if (elt_mode != elt_int_mode)
14577 elt = gen_lowpart (elt_int_mode, elt);
14578
14579 if (!CONST_INT_P (elt))
14580 return false;
14581
14582 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14583 for (unsigned int byte = 0; byte < elt_size; byte++)
14584 {
14585 bytes.quick_push (elt_val & 0xff);
14586 elt_val >>= BITS_PER_UNIT;
14587 }
14588 }
14589
14590 /* The immediate must repeat every eight bytes. */
14591 unsigned int nbytes = bytes.length ();
14592 for (unsigned i = 8; i < nbytes; ++i)
14593 if (bytes[i] != bytes[i - 8])
14594 return false;
14595
14596 /* Get the repeating 8-byte value as an integer. No endian correction
14597 is needed here because bytes is already in lsb-first order. */
14598 unsigned HOST_WIDE_INT val64 = 0;
14599 for (unsigned int i = 0; i < 8; i++)
14600 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14601 << (i * BITS_PER_UNIT));
14602
14603 if (vec_flags & VEC_SVE_DATA)
14604 return aarch64_sve_valid_immediate (val64, info);
14605 else
14606 return aarch64_advsimd_valid_immediate (val64, info, which);
14607 }
14608
14609 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14610 has a step in the range of INDEX. Return the index expression if so,
14611 otherwise return null. */
14612 rtx
14613 aarch64_check_zero_based_sve_index_immediate (rtx x)
14614 {
14615 rtx base, step;
14616 if (const_vec_series_p (x, &base, &step)
14617 && base == const0_rtx
14618 && aarch64_sve_index_immediate_p (step))
14619 return step;
14620 return NULL_RTX;
14621 }
14622
14623 /* Check whether immediate shift constants are within range. */
14624 bool
14625 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14626 {
14627 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14628 if (left)
14629 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14630 else
14631 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14632 }
14633
14634 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14635 operation of width WIDTH at bit position POS. */
14636
14637 rtx
14638 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14639 {
14640 gcc_assert (CONST_INT_P (width));
14641 gcc_assert (CONST_INT_P (pos));
14642
14643 unsigned HOST_WIDE_INT mask
14644 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14645 return GEN_INT (mask << UINTVAL (pos));
14646 }
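
/* Illustrative example (not from the original sources): for WIDTH == 8
   and POS == 16 the result is 0x00ff0000, i.e. eight consecutive ones
   starting at bit 16.  */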
14647
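/* Return true if X is a valid immediate or symbolic operand for a move
   of mode MODE.  */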
14648 bool
14649 aarch64_mov_operand_p (rtx x, machine_mode mode)
14650 {
14651 if (GET_CODE (x) == HIGH
14652 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14653 return true;
14654
14655 if (CONST_INT_P (x))
14656 return true;
14657
14658 if (VECTOR_MODE_P (GET_MODE (x)))
14659 return aarch64_simd_valid_immediate (x, NULL);
14660
14661 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14662 return true;
14663
14664 if (aarch64_sve_cnt_immediate_p (x))
14665 return true;
14666
14667 return aarch64_classify_symbolic_expression (x)
14668 == SYMBOL_TINY_ABSOLUTE;
14669 }
14670
14671 /* Return a const_int vector of VAL. */
14672 rtx
14673 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14674 {
14675 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14676 return gen_const_vec_duplicate (mode, c);
14677 }
14678
14679 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14680
14681 bool
14682 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14683 {
14684 machine_mode vmode;
14685
14686 vmode = aarch64_simd_container_mode (mode, 64);
14687 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14688 return aarch64_simd_valid_immediate (op_v, NULL);
14689 }
14690
14691 /* Construct and return a PARALLEL RTX vector with elements numbering the
14692 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14693 the vector - from the perspective of the architecture. This does not
14694 line up with GCC's perspective on lane numbers, so we end up with
14695 different masks depending on our target endian-ness. The diagram
14696 below may help. We must draw the distinction when building masks
14697 which select one half of the vector. An instruction selecting
14698 architectural low-lanes for a big-endian target must be described using
14699 a mask selecting GCC high-lanes.
14700
14701 Big-Endian Little-Endian
14702
14703 GCC 0 1 2 3 3 2 1 0
14704 | x | x | x | x | | x | x | x | x |
14705 Architecture 3 2 1 0 3 2 1 0
14706
14707 Low Mask: { 2, 3 } { 0, 1 }
14708 High Mask: { 0, 1 } { 2, 3 }
14709
14710 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14711
14712 rtx
14713 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14714 {
14715 rtvec v = rtvec_alloc (nunits / 2);
14716 int high_base = nunits / 2;
14717 int low_base = 0;
14718 int base;
14719 rtx t1;
14720 int i;
14721
14722 if (BYTES_BIG_ENDIAN)
14723 base = high ? low_base : high_base;
14724 else
14725 base = high ? high_base : low_base;
14726
14727 for (i = 0; i < nunits / 2; i++)
14728 RTVEC_ELT (v, i) = GEN_INT (base + i);
14729
14730 t1 = gen_rtx_PARALLEL (mode, v);
14731 return t1;
14732 }
14733
14734 /* Check OP for validity as a PARALLEL RTX vector with elements
14735 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14736 from the perspective of the architecture. See the diagram above
14737 aarch64_simd_vect_par_cnst_half for more details. */
14738
14739 bool
14740 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14741 bool high)
14742 {
14743 int nelts;
14744 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14745 return false;
14746
14747 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14748 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14749 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14750 int i = 0;
14751
14752 if (count_op != count_ideal)
14753 return false;
14754
14755 for (i = 0; i < count_ideal; i++)
14756 {
14757 rtx elt_op = XVECEXP (op, 0, i);
14758 rtx elt_ideal = XVECEXP (ideal, 0, i);
14759
14760 if (!CONST_INT_P (elt_op)
14761 || INTVAL (elt_ideal) != INTVAL (elt_op))
14762 return false;
14763 }
14764 return true;
14765 }
14766
14767 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14768 HIGH (exclusive). */
14769 void
14770 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14771 const_tree exp)
14772 {
14773 HOST_WIDE_INT lane;
14774 gcc_assert (CONST_INT_P (operand));
14775 lane = INTVAL (operand);
14776
14777 if (lane < low || lane >= high)
14778 {
14779 if (exp)
14780 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14781 else
14782 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14783 }
14784 }
14785
14786 /* Perform endian correction on lane number N, which indexes a vector
14787 of mode MODE, and return the result as an SImode rtx. */
14788
14789 rtx
14790 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14791 {
14792 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14793 }
14794
14795 /* Return TRUE if OP is a valid vector addressing mode. */
14796
14797 bool
14798 aarch64_simd_mem_operand_p (rtx op)
14799 {
14800 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14801 || REG_P (XEXP (op, 0)));
14802 }
14803
14804 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14805
14806 bool
14807 aarch64_sve_ld1r_operand_p (rtx op)
14808 {
14809 struct aarch64_address_info addr;
14810 scalar_mode mode;
14811
14812 return (MEM_P (op)
14813 && is_a <scalar_mode> (GET_MODE (op), &mode)
14814 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14815 && addr.type == ADDRESS_REG_IMM
14816 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14817 }
14818
14819 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14820 The conditions for STR are the same. */
14821 bool
14822 aarch64_sve_ldr_operand_p (rtx op)
14823 {
14824 struct aarch64_address_info addr;
14825
14826 return (MEM_P (op)
14827 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14828 false, ADDR_QUERY_ANY)
14829 && addr.type == ADDRESS_REG_IMM);
14830 }
14831
14832 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14833 We need to be able to access the individual pieces, so the range
14834 is different from LD[234] and ST[234]. */
14835 bool
14836 aarch64_sve_struct_memory_operand_p (rtx op)
14837 {
14838 if (!MEM_P (op))
14839 return false;
14840
14841 machine_mode mode = GET_MODE (op);
14842 struct aarch64_address_info addr;
14843 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14844 ADDR_QUERY_ANY)
14845 || addr.type != ADDRESS_REG_IMM)
14846 return false;
14847
14848 poly_int64 first = addr.const_offset;
14849 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14850 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14851 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14852 }
14853
14854 /* Emit a register copy from operand to operand, taking care not to
14855 early-clobber source registers in the process.
14856
14857 COUNT is the number of components into which the copy needs to be
14858 decomposed. */
14859 void
14860 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14861 unsigned int count)
14862 {
14863 unsigned int i;
14864 int rdest = REGNO (operands[0]);
14865 int rsrc = REGNO (operands[1]);
14866
14867 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14868 || rdest < rsrc)
14869 for (i = 0; i < count; i++)
14870 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14871 gen_rtx_REG (mode, rsrc + i));
14872 else
14873 for (i = 0; i < count; i++)
14874 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14875 gen_rtx_REG (mode, rsrc + count - i - 1));
14876 }
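
/* For example (illustrative, not from the original sources): with
   COUNT == 2 and REGNO (operands[0]) == REGNO (operands[1]) + 1 the two
   register ranges overlap and the destination is the higher-numbered
   range, so the copy is emitted from the highest register downwards;
   copying upwards first would overwrite the second source register
   before it has been read.  */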
14877
14878 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14879 one of VSTRUCT modes: OI, CI, or XI. */
14880 int
14881 aarch64_simd_attr_length_rglist (machine_mode mode)
14882 {
14883 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14884 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14885 }
14886
14887 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14888 alignment of a vector to 128 bits. SVE predicates have an alignment of
14889 16 bits. */
14890 static HOST_WIDE_INT
14891 aarch64_simd_vector_alignment (const_tree type)
14892 {
14893 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14894 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14895 be set for non-predicate vectors of booleans. Modes are the most
14896 direct way we have of identifying real SVE predicate types. */
14897 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14898 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14899 return MIN (align, 128);
14900 }
14901
14902 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14903 static poly_uint64
14904 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14905 {
14906 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14907 {
14908 /* If the length of the vector is fixed, try to align to that length,
14909 otherwise don't try to align at all. */
14910 HOST_WIDE_INT result;
14911 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14912 result = TYPE_ALIGN (TREE_TYPE (type));
14913 return result;
14914 }
14915 return TYPE_ALIGN (type);
14916 }
14917
14918 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14919 static bool
14920 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14921 {
14922 if (is_packed)
14923 return false;
14924
14925 /* For fixed-length vectors, check that the vectorizer will aim for
14926 full-vector alignment. This isn't true for generic GCC vectors
14927 that are wider than the ABI maximum of 128 bits. */
14928 poly_uint64 preferred_alignment =
14929 aarch64_vectorize_preferred_vector_alignment (type);
14930 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14931 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14932 preferred_alignment))
14933 return false;
14934
14935 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14936 return true;
14937 }
14938
14939 /* Return true if the vector misalignment factor is supported by the
14940 target. */
14941 static bool
14942 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14943 const_tree type, int misalignment,
14944 bool is_packed)
14945 {
14946 if (TARGET_SIMD && STRICT_ALIGNMENT)
14947 {
14948 /* Return if movmisalign pattern is not supported for this mode. */
14949 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14950 return false;
14951
14952 /* Misalignment factor is unknown at compile time. */
14953 if (misalignment == -1)
14954 return false;
14955 }
14956 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14957 is_packed);
14958 }
14959
14960 /* If VALS is a vector constant that can be loaded into a register
14961 using DUP, generate instructions to do so and return an RTX to
14962 assign to the register. Otherwise return NULL_RTX. */
14963 static rtx
14964 aarch64_simd_dup_constant (rtx vals)
14965 {
14966 machine_mode mode = GET_MODE (vals);
14967 machine_mode inner_mode = GET_MODE_INNER (mode);
14968 rtx x;
14969
14970 if (!const_vec_duplicate_p (vals, &x))
14971 return NULL_RTX;
14972
14973 /* We can load this constant by using DUP and a constant in a
14974 single ARM register. This will be cheaper than a vector
14975 load. */
14976 x = copy_to_mode_reg (inner_mode, x);
14977 return gen_vec_duplicate (mode, x);
14978 }
14979
14980
14981 /* Generate code to load VALS, which is a PARALLEL containing only
14982 constants (for vec_init) or CONST_VECTOR, efficiently into a
14983 register. Returns an RTX to copy into the register, or NULL_RTX
14984 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14985 static rtx
14986 aarch64_simd_make_constant (rtx vals)
14987 {
14988 machine_mode mode = GET_MODE (vals);
14989 rtx const_dup;
14990 rtx const_vec = NULL_RTX;
14991 int n_const = 0;
14992 int i;
14993
14994 if (GET_CODE (vals) == CONST_VECTOR)
14995 const_vec = vals;
14996 else if (GET_CODE (vals) == PARALLEL)
14997 {
14998 /* A CONST_VECTOR must contain only CONST_INTs and
14999 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15000 Only store valid constants in a CONST_VECTOR. */
15001 int n_elts = XVECLEN (vals, 0);
15002 for (i = 0; i < n_elts; ++i)
15003 {
15004 rtx x = XVECEXP (vals, 0, i);
15005 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15006 n_const++;
15007 }
15008 if (n_const == n_elts)
15009 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15010 }
15011 else
15012 gcc_unreachable ();
15013
15014 if (const_vec != NULL_RTX
15015 && aarch64_simd_valid_immediate (const_vec, NULL))
15016 /* Load using MOVI/MVNI. */
15017 return const_vec;
15018 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15019 /* Loaded using DUP. */
15020 return const_dup;
15021 else if (const_vec != NULL_RTX)
15022 /* Load from constant pool. We cannot take advantage of single-cycle
15023 LD1 because we need a PC-relative addressing mode. */
15024 return const_vec;
15025 else
15026 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15027 We cannot construct an initializer. */
15028 return NULL_RTX;
15029 }
15030
15031 /* Expand a vector initialisation sequence, such that TARGET is
15032 initialised to contain VALS. */
15033
15034 void
15035 aarch64_expand_vector_init (rtx target, rtx vals)
15036 {
15037 machine_mode mode = GET_MODE (target);
15038 scalar_mode inner_mode = GET_MODE_INNER (mode);
15039 /* The number of vector elements. */
15040 int n_elts = XVECLEN (vals, 0);
15041 /* The number of vector elements which are not constant. */
15042 int n_var = 0;
15043 rtx any_const = NULL_RTX;
15044 /* The first element of vals. */
15045 rtx v0 = XVECEXP (vals, 0, 0);
15046 bool all_same = true;
15047
15048 /* Count the number of variable elements to initialise. */
15049 for (int i = 0; i < n_elts; ++i)
15050 {
15051 rtx x = XVECEXP (vals, 0, i);
15052 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15053 ++n_var;
15054 else
15055 any_const = x;
15056
15057 all_same &= rtx_equal_p (x, v0);
15058 }
15059
15060 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
15061 how best to handle this. */
15062 if (n_var == 0)
15063 {
15064 rtx constant = aarch64_simd_make_constant (vals);
15065 if (constant != NULL_RTX)
15066 {
15067 emit_move_insn (target, constant);
15068 return;
15069 }
15070 }
15071
15072 /* Splat a single non-constant element if we can. */
15073 if (all_same)
15074 {
15075 rtx x = copy_to_mode_reg (inner_mode, v0);
15076 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15077 return;
15078 }
15079
15080 enum insn_code icode = optab_handler (vec_set_optab, mode);
15081 gcc_assert (icode != CODE_FOR_nothing);
15082
15083 /* If there are only variable elements, try to optimize
15084 the insertion using dup for the most common element
15085 followed by insertions. */
15086
15087 /* The algorithm will fill matches[*][0] with the earliest matching element,
15088 and matches[X][1] with the count of duplicate elements (if X is the
15089 earliest element which has duplicates). */
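  /* For example (illustrative): for VALS == {a, b, a, a}, the loop below
     leaves matches[0][1] == 3 and matches[1][1] == 1, so element 0 is
     chosen as the most common element, broadcast with DUP, and only
     element 1 is inserted afterwards.  */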
15090
15091 if (n_var == n_elts && n_elts <= 16)
15092 {
15093 int matches[16][2] = {0};
15094 for (int i = 0; i < n_elts; i++)
15095 {
15096 for (int j = 0; j <= i; j++)
15097 {
15098 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15099 {
15100 matches[i][0] = j;
15101 matches[j][1]++;
15102 break;
15103 }
15104 }
15105 }
15106 int maxelement = 0;
15107 int maxv = 0;
15108 for (int i = 0; i < n_elts; i++)
15109 if (matches[i][1] > maxv)
15110 {
15111 maxelement = i;
15112 maxv = matches[i][1];
15113 }
15114
15115 /* Create a duplicate of the most common element, unless all elements
15116 are equally useless to us, in which case just immediately set the
15117 vector register using the first element. */
15118
15119 if (maxv == 1)
15120 {
15121 /* For vectors of two 64-bit elements, we can do even better. */
15122 if (n_elts == 2
15123 && (inner_mode == E_DImode
15124 || inner_mode == E_DFmode))
15125
15126 {
15127 rtx x0 = XVECEXP (vals, 0, 0);
15128 rtx x1 = XVECEXP (vals, 0, 1);
15129 /* Combine can pick up this case, but handling it directly
15130 here leaves clearer RTL.
15131
15132 This is load_pair_lanes<mode>, and also gives us a clean-up
15133 for store_pair_lanes<mode>. */
15134 if (memory_operand (x0, inner_mode)
15135 && memory_operand (x1, inner_mode)
15136 && !STRICT_ALIGNMENT
15137 && rtx_equal_p (XEXP (x1, 0),
15138 plus_constant (Pmode,
15139 XEXP (x0, 0),
15140 GET_MODE_SIZE (inner_mode))))
15141 {
15142 rtx t;
15143 if (inner_mode == DFmode)
15144 t = gen_load_pair_lanesdf (target, x0, x1);
15145 else
15146 t = gen_load_pair_lanesdi (target, x0, x1);
15147 emit_insn (t);
15148 return;
15149 }
15150 }
15151 /* The subreg-move sequence below will move into lane zero of the
15152 vector register. For big-endian we want that position to hold
15153 the last element of VALS. */
15154 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15155 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15156 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15157 }
15158 else
15159 {
15160 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15161 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15162 }
15163
15164 /* Insert the rest. */
15165 for (int i = 0; i < n_elts; i++)
15166 {
15167 rtx x = XVECEXP (vals, 0, i);
15168 if (matches[i][0] == maxelement)
15169 continue;
15170 x = copy_to_mode_reg (inner_mode, x);
15171 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15172 }
15173 return;
15174 }
15175
15176 /* Initialise a vector which is part-variable. We want to first try
15177 to build those lanes which are constant in the most efficient way we
15178 can. */
15179 if (n_var != n_elts)
15180 {
15181 rtx copy = copy_rtx (vals);
15182
15183 /* Load constant part of vector. We really don't care what goes into the
15184 parts we will overwrite, but we're more likely to be able to load the
15185 constant efficiently if it has fewer, larger, repeating parts
15186 (see aarch64_simd_valid_immediate). */
15187 for (int i = 0; i < n_elts; i++)
15188 {
15189 rtx x = XVECEXP (vals, 0, i);
15190 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15191 continue;
15192 rtx subst = any_const;
15193 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15194 {
15195 /* Look in the copied vector, as more elements are const. */
15196 rtx test = XVECEXP (copy, 0, i ^ bit);
15197 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15198 {
15199 subst = test;
15200 break;
15201 }
15202 }
15203 XVECEXP (copy, 0, i) = subst;
15204 }
15205 aarch64_expand_vector_init (target, copy);
15206 }
15207
15208 /* Insert the variable lanes directly. */
15209 for (int i = 0; i < n_elts; i++)
15210 {
15211 rtx x = XVECEXP (vals, 0, i);
15212 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15213 continue;
15214 x = copy_to_mode_reg (inner_mode, x);
15215 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15216 }
15217 }
15218
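/* Return the mask to apply to shift amounts of mode MODE:
   GET_MODE_UNIT_BITSIZE (MODE) - 1 when scalar shift counts are
   truncated, or zero for vector modes and when SHIFT_COUNT_TRUNCATED
   is false.  */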
15219 static unsigned HOST_WIDE_INT
15220 aarch64_shift_truncation_mask (machine_mode mode)
15221 {
15222 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15223 return 0;
15224 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15225 }
15226
15227 /* Select a format to encode pointers in exception handling data. */
15228 int
15229 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15230 {
15231 int type;
15232 switch (aarch64_cmodel)
15233 {
15234 case AARCH64_CMODEL_TINY:
15235 case AARCH64_CMODEL_TINY_PIC:
15236 case AARCH64_CMODEL_SMALL:
15237 case AARCH64_CMODEL_SMALL_PIC:
15238 case AARCH64_CMODEL_SMALL_SPIC:
15239 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15240 for everything. */
15241 type = DW_EH_PE_sdata4;
15242 break;
15243 default:
15244 /* No assumptions here. 8-byte relocs required. */
15245 type = DW_EH_PE_sdata8;
15246 break;
15247 }
15248 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15249 }
15250
15251 /* The last .arch and .tune assembly strings that we printed. */
15252 static std::string aarch64_last_printed_arch_string;
15253 static std::string aarch64_last_printed_tune_string;
15254
15255 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15256 by the function fndecl. */
15257
15258 void
15259 aarch64_declare_function_name (FILE *stream, const char* name,
15260 tree fndecl)
15261 {
15262 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15263
15264 struct cl_target_option *targ_options;
15265 if (target_parts)
15266 targ_options = TREE_TARGET_OPTION (target_parts);
15267 else
15268 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15269 gcc_assert (targ_options);
15270
15271 const struct processor *this_arch
15272 = aarch64_get_arch (targ_options->x_explicit_arch);
15273
15274 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15275 std::string extension
15276 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15277 this_arch->flags);
15278 /* Only update the assembler .arch string if it is distinct from the last
15279 such string we printed. */
15280 std::string to_print = this_arch->name + extension;
15281 if (to_print != aarch64_last_printed_arch_string)
15282 {
15283 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15284 aarch64_last_printed_arch_string = to_print;
15285 }
15286
15287 /* Print the cpu name we're tuning for in the comments; it might be
15288 useful to readers of the generated asm. Do it only when it changes
15289 from function to function and verbose assembly is requested. */
15290 const struct processor *this_tune
15291 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15292
15293 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15294 {
15295 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15296 this_tune->name);
15297 aarch64_last_printed_tune_string = this_tune->name;
15298 }
15299
15300 /* Don't forget the type directive for ELF. */
15301 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15302 ASM_OUTPUT_LABEL (stream, name);
15303 }
15304
15305 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15306
15307 static void
15308 aarch64_start_file (void)
15309 {
15310 struct cl_target_option *default_options
15311 = TREE_TARGET_OPTION (target_option_default_node);
15312
15313 const struct processor *default_arch
15314 = aarch64_get_arch (default_options->x_explicit_arch);
15315 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15316 std::string extension
15317 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15318 default_arch->flags);
15319
15320 aarch64_last_printed_arch_string = default_arch->name + extension;
15321 aarch64_last_printed_tune_string = "";
15322 asm_fprintf (asm_out_file, "\t.arch %s\n",
15323 aarch64_last_printed_arch_string.c_str ());
15324
15325 default_file_start ();
15326 }
15327
15328 /* Emit load exclusive. */
15329
15330 static void
15331 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15332 rtx mem, rtx model_rtx)
15333 {
15334 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15335 }
15336
15337 /* Emit store exclusive. */
15338
15339 static void
15340 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15341 rtx mem, rtx rval, rtx model_rtx)
15342 {
15343 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
15344 }
15345
15346 /* Mark the previous jump instruction as unlikely. */
15347
15348 static void
15349 aarch64_emit_unlikely_jump (rtx insn)
15350 {
15351 rtx_insn *jump = emit_jump_insn (insn);
15352 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15353 }
15354
15355 /* Expand a compare and swap pattern. */
15356
15357 void
15358 aarch64_expand_compare_and_swap (rtx operands[])
15359 {
15360 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15361 machine_mode mode, r_mode;
15362
15363 bval = operands[0];
15364 rval = operands[1];
15365 mem = operands[2];
15366 oldval = operands[3];
15367 newval = operands[4];
15368 is_weak = operands[5];
15369 mod_s = operands[6];
15370 mod_f = operands[7];
15371 mode = GET_MODE (mem);
15372
15373 /* Normally the succ memory model must be stronger than fail, but in the
15374 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15375 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15376 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15377 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15378 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
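  /* Illustrative only: a call such as
       __atomic_compare_exchange_n (p, &expected, desired, 0,
                                    __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
     is one case where the success model is promoted to ACQ_REL here, so
     that the acquire semantics of the failure ordering are preserved.  */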
15379
15380 r_mode = mode;
15381 if (mode == QImode || mode == HImode)
15382 {
15383 r_mode = SImode;
15384 rval = gen_reg_rtx (r_mode);
15385 }
15386
15387 if (TARGET_LSE)
15388 {
15389 /* The CAS insn requires oldval and rval overlap, but we need to
15390 have a copy of oldval saved across the operation to tell if
15391 the operation is successful. */
15392 if (reg_overlap_mentioned_p (rval, oldval))
15393 rval = copy_to_mode_reg (r_mode, oldval);
15394 else
15395 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15396
15397 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15398 newval, mod_s));
15399 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15400 }
15401 else
15402 {
15403 /* The oldval predicate varies by mode. Test it and force to reg. */
15404 insn_code code = code_for_aarch64_compare_and_swap (mode);
15405 if (!insn_data[code].operand[2].predicate (oldval, mode))
15406 oldval = force_reg (mode, oldval);
15407
15408 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15409 is_weak, mod_s, mod_f));
15410 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15411 }
15412
15413 if (r_mode != mode)
15414 rval = gen_lowpart (mode, rval);
15415 emit_move_insn (operands[1], rval);
15416
15417 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15418 emit_insn (gen_rtx_SET (bval, x));
15419 }
15420
15421 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15422 sequence implementing an atomic operation. */
15423
15424 static void
15425 aarch64_emit_post_barrier (enum memmodel model)
15426 {
15427 const enum memmodel base_model = memmodel_base (model);
15428
15429 if (is_mm_sync (model)
15430 && (base_model == MEMMODEL_ACQUIRE
15431 || base_model == MEMMODEL_ACQ_REL
15432 || base_model == MEMMODEL_SEQ_CST))
15433 {
15434 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15435 }
15436 }
15437
15438 /* Split a compare and swap pattern. */
15439
15440 void
15441 aarch64_split_compare_and_swap (rtx operands[])
15442 {
15443 rtx rval, mem, oldval, newval, scratch;
15444 machine_mode mode;
15445 bool is_weak;
15446 rtx_code_label *label1, *label2;
15447 rtx x, cond;
15448 enum memmodel model;
15449 rtx model_rtx;
15450
15451 rval = operands[0];
15452 mem = operands[1];
15453 oldval = operands[2];
15454 newval = operands[3];
15455 is_weak = (operands[4] != const0_rtx);
15456 model_rtx = operands[5];
15457 scratch = operands[7];
15458 mode = GET_MODE (mem);
15459 model = memmodel_from_int (INTVAL (model_rtx));
15460
15461 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15462 loop:
15463 .label1:
15464 LD[A]XR rval, [mem]
15465 CBNZ rval, .label2
15466 ST[L]XR scratch, newval, [mem]
15467 CBNZ scratch, .label1
15468 .label2:
15469 CMP rval, 0. */
15470 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15471
15472 label1 = NULL;
15473 if (!is_weak)
15474 {
15475 label1 = gen_label_rtx ();
15476 emit_label (label1);
15477 }
15478 label2 = gen_label_rtx ();
15479
15480 /* The initial load can be relaxed for a __sync operation since a final
15481 barrier will be emitted to stop code hoisting. */
15482 if (is_mm_sync (model))
15483 aarch64_emit_load_exclusive (mode, rval, mem,
15484 GEN_INT (MEMMODEL_RELAXED));
15485 else
15486 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15487
15488 if (strong_zero_p)
15489 {
15490 if (aarch64_track_speculation)
15491 {
15492 /* Emit an explicit compare instruction, so that we can correctly
15493 track the condition codes. */
15494 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15495 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15496 }
15497 else
15498 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15499
15500 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15501 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15502 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15503 }
15504 else
15505 {
15506 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15507 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15508 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15509 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15510 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15511 }
15512
15513 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15514
15515 if (!is_weak)
15516 {
15517 if (aarch64_track_speculation)
15518 {
15519 /* Emit an explicit compare instruction, so that we can correctly
15520 track the condition codes. */
15521 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15522 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15523 }
15524 else
15525 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15526
15527 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15528 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15529 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15530 }
15531 else
15532 {
15533 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15534 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15535 emit_insn (gen_rtx_SET (cond, x));
15536 }
15537
15538 emit_label (label2);
15539 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15540 to set the condition flags. If this is not used it will be removed by
15541 later passes. */
15542 if (strong_zero_p)
15543 {
15544 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15545 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15546 emit_insn (gen_rtx_SET (cond, x));
15547 }
15548 /* Emit any final barrier needed for a __sync operation. */
15549 if (is_mm_sync (model))
15550 aarch64_emit_post_barrier (model);
15551 }
15552
15553 /* Split an atomic operation. */
15554
15555 void
15556 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15557 rtx value, rtx model_rtx, rtx cond)
15558 {
15559 machine_mode mode = GET_MODE (mem);
15560 machine_mode wmode = (mode == DImode ? DImode : SImode);
15561 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15562 const bool is_sync = is_mm_sync (model);
15563 rtx_code_label *label;
15564 rtx x;
15565
15566 /* Split the atomic operation into a sequence. */
15567 label = gen_label_rtx ();
15568 emit_label (label);
15569
15570 if (new_out)
15571 new_out = gen_lowpart (wmode, new_out);
15572 if (old_out)
15573 old_out = gen_lowpart (wmode, old_out);
15574 else
15575 old_out = new_out;
15576 value = simplify_gen_subreg (wmode, value, mode, 0);
15577
15578 /* The initial load can be relaxed for a __sync operation since a final
15579 barrier will be emitted to stop code hoisting. */
15580 if (is_sync)
15581 aarch64_emit_load_exclusive (mode, old_out, mem,
15582 GEN_INT (MEMMODEL_RELAXED));
15583 else
15584 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15585
15586 switch (code)
15587 {
15588 case SET:
15589 new_out = value;
15590 break;
15591
15592 case NOT:
15593 x = gen_rtx_AND (wmode, old_out, value);
15594 emit_insn (gen_rtx_SET (new_out, x));
15595 x = gen_rtx_NOT (wmode, new_out);
15596 emit_insn (gen_rtx_SET (new_out, x));
15597 break;
15598
15599 case MINUS:
15600 if (CONST_INT_P (value))
15601 {
15602 value = GEN_INT (-INTVAL (value));
15603 code = PLUS;
15604 }
15605 /* Fall through. */
15606
15607 default:
15608 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15609 emit_insn (gen_rtx_SET (new_out, x));
15610 break;
15611 }
15612
15613 aarch64_emit_store_exclusive (mode, cond, mem,
15614 gen_lowpart (mode, new_out), model_rtx);
15615
15616 if (aarch64_track_speculation)
15617 {
15618 /* Emit an explicit compare instruction, so that we can correctly
15619 track the condition codes. */
15620 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15621 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15622 }
15623 else
15624 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15625
15626 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15627 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15628 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15629
15630 /* Emit any final barrier needed for a __sync operation. */
15631 if (is_sync)
15632 aarch64_emit_post_barrier (model);
15633 }
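
/* Illustrative only (register numbers and the relaxed memory model are
   arbitrary): for CODE == PLUS the split sequence above corresponds
   roughly to

	.loop:
	ldxr	x0, [x2]
	add	x1, x0, x3
	stxr	w4, x1, [x2]
	cbnz	w4, .loop

   with LDAXR/STLXR variants used instead when the memory model requires
   acquire or release semantics.  */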
15634
15635 static void
15636 aarch64_init_libfuncs (void)
15637 {
15638 /* Half-precision float operations. The compiler handles all operations
15639 with NULL libfuncs by converting to SFmode. */
15640
15641 /* Conversions. */
15642 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15643 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15644
15645 /* Arithmetic. */
15646 set_optab_libfunc (add_optab, HFmode, NULL);
15647 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15648 set_optab_libfunc (smul_optab, HFmode, NULL);
15649 set_optab_libfunc (neg_optab, HFmode, NULL);
15650 set_optab_libfunc (sub_optab, HFmode, NULL);
15651
15652 /* Comparisons. */
15653 set_optab_libfunc (eq_optab, HFmode, NULL);
15654 set_optab_libfunc (ne_optab, HFmode, NULL);
15655 set_optab_libfunc (lt_optab, HFmode, NULL);
15656 set_optab_libfunc (le_optab, HFmode, NULL);
15657 set_optab_libfunc (ge_optab, HFmode, NULL);
15658 set_optab_libfunc (gt_optab, HFmode, NULL);
15659 set_optab_libfunc (unord_optab, HFmode, NULL);
15660 }
15661
15662 /* Target hook for c_mode_for_suffix. */
15663 static machine_mode
15664 aarch64_c_mode_for_suffix (char suffix)
15665 {
15666 if (suffix == 'q')
15667 return TFmode;
15668
15669 return VOIDmode;
15670 }
15671
15672 /* We can only represent floating point constants which will fit in
15673 "quarter-precision" values. These values are characterised by
15674 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
15675 by:
15676
15677 (-1)^s * (n/16) * 2^r
15678
15679 Where:
15680 's' is the sign bit.
15681 'n' is an integer in the range 16 <= n <= 31.
15682 'r' is an integer in the range -3 <= r <= 4. */
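
/* Worked examples (illustrative): 1.0 = (16/16) * 2^0, the smallest
   positive value is 0.125 = (16/16) * 2^-3 and the largest is
   31.0 = (31/16) * 2^4.  */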
15683
15684 /* Return true iff X can be represented by a quarter-precision
15685 floating point immediate operand. Note, we cannot represent 0.0. */
15686 bool
15687 aarch64_float_const_representable_p (rtx x)
15688 {
15689 /* This represents our current view of how many bits
15690 make up the mantissa. */
15691 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15692 int exponent;
15693 unsigned HOST_WIDE_INT mantissa, mask;
15694 REAL_VALUE_TYPE r, m;
15695 bool fail;
15696
15697 if (!CONST_DOUBLE_P (x))
15698 return false;
15699
15700 if (GET_MODE (x) == VOIDmode
15701 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15702 return false;
15703
15704 r = *CONST_DOUBLE_REAL_VALUE (x);
15705
15706 /* We cannot represent infinities, NaNs or +/-zero. We won't
15707 know if we have +zero until we analyse the mantissa, but we
15708 can reject the other invalid values. */
15709 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15710 || REAL_VALUE_MINUS_ZERO (r))
15711 return false;
15712
15713 /* Extract exponent. */
15714 r = real_value_abs (&r);
15715 exponent = REAL_EXP (&r);
15716
15717 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15718 highest (sign) bit, with a fixed binary point at bit point_pos.
15719 The low HOST_WIDE_INT of W holds the low part of the mantissa, the high HOST_WIDE_INT the high part.
15720 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15721 bits for the mantissa, this can fail (low bits will be lost). */
15722 real_ldexp (&m, &r, point_pos - exponent);
15723 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15724
15725 /* If the low part of the mantissa has bits set we cannot represent
15726 the value. */
15727 if (w.ulow () != 0)
15728 return false;
15729 /* We have rejected the lower HOST_WIDE_INT, so update our
15730 understanding of how many bits lie in the mantissa and
15731 look only at the high HOST_WIDE_INT. */
15732 mantissa = w.elt (1);
15733 point_pos -= HOST_BITS_PER_WIDE_INT;
15734
15735 /* We can only represent values with a mantissa of the form 1.xxxx. */
15736 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15737 if ((mantissa & mask) != 0)
15738 return false;
15739
15740 /* Having filtered unrepresentable values, we may now remove all
15741 but the highest 5 bits. */
15742 mantissa >>= point_pos - 5;
15743
15744 /* We cannot represent the value 0.0, so reject it. This is handled
15745 elsewhere. */
15746 if (mantissa == 0)
15747 return false;
15748
15749 /* Then, as bit 4 is always set, we can mask it off, leaving
15750 the mantissa in the range [0, 15]. */
15751 mantissa &= ~(1 << 4);
15752 gcc_assert (mantissa <= 15);
15753
15754 /* GCC internally does not use IEEE754-like encoding (where normalized
15755 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15756 Our mantissa values are shifted 4 places to the left relative to
15757 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15758 by 5 places to correct for GCC's representation. */
15759 exponent = 5 - exponent;
15760
15761 return (exponent >= 0 && exponent <= 7);
15762 }
15763
15764 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15765 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15766 output MOVI/MVNI, ORR or BIC immediate. */
15767 char*
15768 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15769 enum simd_immediate_check which)
15770 {
15771 bool is_valid;
15772 static char templ[40];
15773 const char *mnemonic;
15774 const char *shift_op;
15775 unsigned int lane_count = 0;
15776 char element_char;
15777
15778 struct simd_immediate_info info;
15779
15780 /* This will return true to show const_vector is legal for use as either
15781 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15782 It will also update INFO to show how the immediate should be generated.
15783 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15784 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15785 gcc_assert (is_valid);
15786
15787 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15788 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15789
15790 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15791 {
15792 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15793 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15794 move immediate path. */
15795 if (aarch64_float_const_zero_rtx_p (info.value))
15796 info.value = GEN_INT (0);
15797 else
15798 {
15799 const unsigned int buf_size = 20;
15800 char float_buf[buf_size] = {'\0'};
15801 real_to_decimal_for_mode (float_buf,
15802 CONST_DOUBLE_REAL_VALUE (info.value),
15803 buf_size, buf_size, 1, info.elt_mode);
15804
15805 if (lane_count == 1)
15806 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15807 else
15808 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15809 lane_count, element_char, float_buf);
15810 return templ;
15811 }
15812 }
15813
15814 gcc_assert (CONST_INT_P (info.value));
15815
15816 if (which == AARCH64_CHECK_MOV)
15817 {
15818 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15819 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15820 if (lane_count == 1)
15821 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15822 mnemonic, UINTVAL (info.value));
15823 else if (info.shift)
15824 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15825 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15826 element_char, UINTVAL (info.value), shift_op, info.shift);
15827 else
15828 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15829 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15830 element_char, UINTVAL (info.value));
15831 }
15832 else
15833 {
15834 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15835 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15836 if (info.shift)
15837 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15838 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15839 element_char, UINTVAL (info.value), "lsl", info.shift);
15840 else
15841 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15842 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15843 element_char, UINTVAL (info.value));
15844 }
15845 return templ;
15846 }
15847
15848 char*
15849 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15850 {
15851
15852 /* If a floating point number was passed and we desire to use it in an
15853 integer mode, do the conversion to integer. */
15854 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15855 {
15856 unsigned HOST_WIDE_INT ival;
15857 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15858 gcc_unreachable ();
15859 immediate = gen_int_mode (ival, mode);
15860 }
15861
15862 machine_mode vmode;
15863 /* Use a 64-bit mode for everything except DI/DF mode, where we use
15864 a 128-bit vector mode. */
15865 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15866
15867 vmode = aarch64_simd_container_mode (mode, width);
15868 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15869 return aarch64_output_simd_mov_immediate (v_op, width);
15870 }
15871
15872 /* Return the output string to use for moving immediate CONST_VECTOR
15873 into an SVE register. */
15874
15875 char *
15876 aarch64_output_sve_mov_immediate (rtx const_vector)
15877 {
15878 static char templ[40];
15879 struct simd_immediate_info info;
15880 char element_char;
15881
15882 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15883 gcc_assert (is_valid);
15884
15885 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15886
15887 if (info.step)
15888 {
15889 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15890 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15891 element_char, INTVAL (info.value), INTVAL (info.step));
15892 return templ;
15893 }
15894
15895 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15896 {
15897 if (aarch64_float_const_zero_rtx_p (info.value))
15898 info.value = GEN_INT (0);
15899 else
15900 {
15901 const int buf_size = 20;
15902 char float_buf[buf_size] = {};
15903 real_to_decimal_for_mode (float_buf,
15904 CONST_DOUBLE_REAL_VALUE (info.value),
15905 buf_size, buf_size, 1, info.elt_mode);
15906
15907 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15908 element_char, float_buf);
15909 return templ;
15910 }
15911 }
15912
15913 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15914 element_char, INTVAL (info.value));
15915 return templ;
15916 }
15917
15918 /* Return the asm format for a PTRUE instruction whose destination has
15919 mode MODE. SUFFIX is the element size suffix. */
15920
15921 char *
15922 aarch64_output_ptrue (machine_mode mode, char suffix)
15923 {
15924 unsigned int nunits;
15925 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15926 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15927 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15928 else
15929 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15930 return buf;
15931 }
15932
15933 /* Split operands into moves from op[1] + op[2] into op[0]. */
15934
15935 void
15936 aarch64_split_combinev16qi (rtx operands[3])
15937 {
15938 unsigned int dest = REGNO (operands[0]);
15939 unsigned int src1 = REGNO (operands[1]);
15940 unsigned int src2 = REGNO (operands[2]);
15941 machine_mode halfmode = GET_MODE (operands[1]);
15942 unsigned int halfregs = REG_NREGS (operands[1]);
15943 rtx destlo, desthi;
15944
15945 gcc_assert (halfmode == V16QImode);
15946
15947 if (src1 == dest && src2 == dest + halfregs)
15948 {
15949 /* No-op move. Can't split to nothing; emit something. */
15950 emit_note (NOTE_INSN_DELETED);
15951 return;
15952 }
15953
15954 /* Preserve register attributes for variable tracking. */
15955 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15956 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15957 GET_MODE_SIZE (halfmode));
15958
15959 /* Special case of reversed high/low parts. */
15960 if (reg_overlap_mentioned_p (operands[2], destlo)
15961 && reg_overlap_mentioned_p (operands[1], desthi))
15962 {
15963 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15964 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15965 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15966 }
15967 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15968 {
15969 /* Try to avoid unnecessary moves if part of the result
15970 is in the right place already. */
15971 if (src1 != dest)
15972 emit_move_insn (destlo, operands[1]);
15973 if (src2 != dest + halfregs)
15974 emit_move_insn (desthi, operands[2]);
15975 }
15976 else
15977 {
15978 if (src2 != dest + halfregs)
15979 emit_move_insn (desthi, operands[2]);
15980 if (src1 != dest)
15981 emit_move_insn (destlo, operands[1]);
15982 }
15983 }
15984
15985 /* vec_perm support. */
15986
15987 struct expand_vec_perm_d
15988 {
15989 rtx target, op0, op1;
15990 vec_perm_indices perm;
15991 machine_mode vmode;
15992 unsigned int vec_flags;
15993 bool one_vector_p;
15994 bool testing_p;
15995 };
15996
15997 /* Generate a variable permutation. */
15998
15999 static void
16000 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16001 {
16002 machine_mode vmode = GET_MODE (target);
16003 bool one_vector_p = rtx_equal_p (op0, op1);
16004
16005 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16006 gcc_checking_assert (GET_MODE (op0) == vmode);
16007 gcc_checking_assert (GET_MODE (op1) == vmode);
16008 gcc_checking_assert (GET_MODE (sel) == vmode);
16009 gcc_checking_assert (TARGET_SIMD);
16010
16011 if (one_vector_p)
16012 {
16013 if (vmode == V8QImode)
16014 {
16015 /* Expand the argument to a V16QI mode by duplicating it. */
16016 rtx pair = gen_reg_rtx (V16QImode);
16017 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16018 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16019 }
16020 else
16021 {
16022 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16023 }
16024 }
16025 else
16026 {
16027 rtx pair;
16028
16029 if (vmode == V8QImode)
16030 {
16031 pair = gen_reg_rtx (V16QImode);
16032 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16033 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16034 }
16035 else
16036 {
16037 pair = gen_reg_rtx (OImode);
16038 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16039 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16040 }
16041 }
16042 }
16043
16044 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16045 NELT is the number of elements in the vector. */
16046
16047 void
16048 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16049 unsigned int nelt)
16050 {
16051 machine_mode vmode = GET_MODE (target);
16052 bool one_vector_p = rtx_equal_p (op0, op1);
16053 rtx mask;
16054
16055 /* The TBL instruction does not use a modulo index, so we must take care
16056 of that ourselves. */
16057 mask = aarch64_simd_gen_const_vector_dup (vmode,
16058 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16059 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16060
16061 /* For big-endian, we also need to reverse the index within the vector
16062 (but not which vector). */
16063 if (BYTES_BIG_ENDIAN)
16064 {
16065 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16066 if (!one_vector_p)
16067 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16068 sel = expand_simple_binop (vmode, XOR, sel, mask,
16069 NULL, 0, OPTAB_LIB_WIDEN);
16070 }
16071 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16072 }
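
/* Illustrative scalar model of the selector adjustment above (hypothetical
   helper; the index arithmetic is what matters, not the element type).
   Wrap each index into range with AND, since TBL has no modulo semantics,
   then flip the index within its vector with XOR when the target is
   big-endian.  */
static void
model_tbl_selector (unsigned char *sel, unsigned int nelt,
                    bool one_vector_p, bool big_endian_p)
{
  unsigned int wrap = one_vector_p ? nelt - 1 : 2 * nelt - 1;
  for (unsigned int i = 0; i < nelt; i++)
    {
      sel[i] &= wrap;           /* Take the index modulo the input size.  */
      if (big_endian_p)
        sel[i] ^= nelt - 1;     /* Reverse the index within a vector.  */
    }
}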
16073
16074 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16075
16076 static void
16077 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16078 {
16079 emit_insn (gen_rtx_SET (target,
16080 gen_rtx_UNSPEC (GET_MODE (target),
16081 gen_rtvec (2, op0, op1), code)));
16082 }
16083
16084 /* Expand an SVE vec_perm with the given operands. */
16085
16086 void
16087 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16088 {
16089 machine_mode data_mode = GET_MODE (target);
16090 machine_mode sel_mode = GET_MODE (sel);
16091 /* Enforced by the pattern condition. */
16092 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16093
16094 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16095 size of the two value vectors, i.e. the upper bits of the indices
16096 are effectively ignored. SVE TBL instead produces 0 for any
16097 out-of-range indices, so we need to modulo all the vec_perm indices
16098 to ensure they are all in range. */
16099 rtx sel_reg = force_reg (sel_mode, sel);
16100
16101 /* Check if SEL only references the first values vector. */
16102 if (GET_CODE (sel) == CONST_VECTOR
16103 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16104 {
16105 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16106 return;
16107 }
16108
16109 /* Check if the two values vectors are the same. */
16110 if (rtx_equal_p (op0, op1))
16111 {
16112 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16113 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16114 NULL, 0, OPTAB_DIRECT);
16115 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16116 return;
16117 }
16118
16119 /* Run TBL on each value vector and combine the results. */
16120
16121 rtx res0 = gen_reg_rtx (data_mode);
16122 rtx res1 = gen_reg_rtx (data_mode);
16123 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16124 if (GET_CODE (sel) != CONST_VECTOR
16125 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16126 {
16127 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16128 2 * nunits - 1);
16129 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16130 NULL, 0, OPTAB_DIRECT);
16131 }
16132 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16133 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16134 NULL, 0, OPTAB_DIRECT);
16135 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16136 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16137 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16138 else
16139 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16140 }
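
/* Scalar model of the general two-vector case above (hypothetical helper,
   unsigned elements assumed).  SVE TBL yields 0 for out-of-range indices,
   so a TBL of OP0 with the wrapped index plus a TBL of OP1 with the index
   shifted down by NUNITS leaves exactly one in-range lookup per lane, and
   the two results can simply be ORed together.  */
static void
model_sve_two_vector_tbl (const unsigned int *op0, const unsigned int *op1,
                          const unsigned int *sel, unsigned int *out,
                          unsigned int nunits)
{
  for (unsigned int i = 0; i < nunits; i++)
    {
      unsigned int idx = sel[i] & (2 * nunits - 1);
      unsigned int idx1 = idx - nunits; /* Wraps out of range if idx < nunits.  */
      unsigned int r0 = idx < nunits ? op0[idx] : 0;
      unsigned int r1 = idx1 < nunits ? op1[idx1] : 0;
      out[i] = r0 | r1;
    }
}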
16141
16142 /* Recognize patterns suitable for the TRN instructions. */
16143 static bool
16144 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16145 {
16146 HOST_WIDE_INT odd;
16147 poly_uint64 nelt = d->perm.length ();
16148 rtx out, in0, in1, x;
16149 machine_mode vmode = d->vmode;
16150
16151 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16152 return false;
16153
16154 /* Note that these are little-endian tests.
16155 We correct for big-endian later. */
16156 if (!d->perm[0].is_constant (&odd)
16157 || (odd != 0 && odd != 1)
16158 || !d->perm.series_p (0, 2, odd, 2)
16159 || !d->perm.series_p (1, 2, nelt + odd, 2))
16160 return false;
16161
16162 /* Success! */
16163 if (d->testing_p)
16164 return true;
16165
16166 in0 = d->op0;
16167 in1 = d->op1;
16168 /* We don't need a big-endian lane correction for SVE; see the comment
16169 at the head of aarch64-sve.md for details. */
16170 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16171 {
16172 x = in0, in0 = in1, in1 = x;
16173 odd = !odd;
16174 }
16175 out = d->target;
16176
16177 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16178 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16179 return true;
16180 }
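
/* Hypothetical helper showing the index series the matcher above accepts:
   for a permutation of length NELT over the concatenation of two inputs,
   TRN1 (ODD == 0) selects 0, NELT, 2, NELT + 2, ... and TRN2 (ODD == 1)
   selects 1, NELT + 1, 3, NELT + 3, ...  */
static void
build_trn_indices (unsigned int *sel, unsigned int nelt, unsigned int odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = (i & ~1U) + odd + ((i & 1) ? nelt : 0);
}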
16181
16182 /* Recognize patterns suitable for the UZP instructions. */
16183 static bool
16184 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16185 {
16186 HOST_WIDE_INT odd;
16187 rtx out, in0, in1, x;
16188 machine_mode vmode = d->vmode;
16189
16190 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16191 return false;
16192
16193 /* Note that these are little-endian tests.
16194 We correct for big-endian later. */
16195 if (!d->perm[0].is_constant (&odd)
16196 || (odd != 0 && odd != 1)
16197 || !d->perm.series_p (0, 1, odd, 2))
16198 return false;
16199
16200 /* Success! */
16201 if (d->testing_p)
16202 return true;
16203
16204 in0 = d->op0;
16205 in1 = d->op1;
16206 /* We don't need a big-endian lane correction for SVE; see the comment
16207 at the head of aarch64-sve.md for details. */
16208 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16209 {
16210 x = in0, in0 = in1, in1 = x;
16211 odd = !odd;
16212 }
16213 out = d->target;
16214
16215 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16216 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16217 return true;
16218 }
16219
16220 /* Recognize patterns suitable for the ZIP instructions. */
16221 static bool
16222 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16223 {
16224 unsigned int high;
16225 poly_uint64 nelt = d->perm.length ();
16226 rtx out, in0, in1, x;
16227 machine_mode vmode = d->vmode;
16228
16229 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16230 return false;
16231
16232 /* Note that these are little-endian tests.
16233 We correct for big-endian later. */
16234 poly_uint64 first = d->perm[0];
16235 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16236 || !d->perm.series_p (0, 2, first, 1)
16237 || !d->perm.series_p (1, 2, first + nelt, 1))
16238 return false;
16239 high = maybe_ne (first, 0U);
16240
16241 /* Success! */
16242 if (d->testing_p)
16243 return true;
16244
16245 in0 = d->op0;
16246 in1 = d->op1;
16247 /* We don't need a big-endian lane correction for SVE; see the comment
16248 at the head of aarch64-sve.md for details. */
16249 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16250 {
16251 x = in0, in0 = in1, in1 = x;
16252 high = !high;
16253 }
16254 out = d->target;
16255
16256 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16257 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16258 return true;
16259 }
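
/* Hypothetical counterpart for the ZIP matcher above: ZIP1 (FIRST == 0)
   interleaves the low halves of the two inputs, giving
   0, NELT, 1, NELT + 1, ..., while ZIP2 (FIRST == NELT / 2) interleaves
   the high halves.  */
static void
build_zip_indices (unsigned int *sel, unsigned int nelt, unsigned int first)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = first + i / 2 + ((i & 1) ? nelt : 0);
}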
16260
16261 /* Recognize patterns for the EXT insn. */
16262
16263 static bool
16264 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16265 {
16266 HOST_WIDE_INT location;
16267 rtx offset;
16268
16269 /* The first element always refers to the first vector.
16270 Check if the extracted indices are increasing by one. */
16271 if (d->vec_flags == VEC_SVE_PRED
16272 || !d->perm[0].is_constant (&location)
16273 || !d->perm.series_p (0, 1, location, 1))
16274 return false;
16275
16276 /* Success! */
16277 if (d->testing_p)
16278 return true;
16279
16280 /* The case where (location == 0) is a no-op for both big- and little-endian,
16281 and is removed by the mid-end at optimization levels -O1 and higher.
16282
16283 We don't need a big-endian lane correction for SVE; see the comment
16284 at the head of aarch64-sve.md for details. */
16285 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16286 {
16287 /* After setup, we want the high elements of the first vector (stored
16288 at the LSB end of the register), and the low elements of the second
16289 vector (stored at the MSB end of the register). So swap. */
16290 std::swap (d->op0, d->op1);
16291 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16292 to_constant () is safe since this is restricted to Advanced SIMD
16293 vectors. */
16294 location = d->perm.length ().to_constant () - location;
16295 }
16296
16297 offset = GEN_INT (location);
16298 emit_set_insn (d->target,
16299 gen_rtx_UNSPEC (d->vmode,
16300 gen_rtvec (3, d->op0, d->op1, offset),
16301 UNSPEC_EXT));
16302 return true;
16303 }
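
/* Hypothetical index builder for the EXT matcher above: an extract at
   LOCATION selects the contiguous window of NELT elements starting at
   LOCATION within the concatenation of the two inputs.  */
static void
build_ext_indices (unsigned int *sel, unsigned int nelt,
                   unsigned int location)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = location + i;
}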
16304
16305 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16306 within each 64-bit, 32-bit or 16-bit granule. */
16307
16308 static bool
16309 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16310 {
16311 HOST_WIDE_INT diff;
16312 unsigned int i, size, unspec;
16313 machine_mode pred_mode;
16314
16315 if (d->vec_flags == VEC_SVE_PRED
16316 || !d->one_vector_p
16317 || !d->perm[0].is_constant (&diff))
16318 return false;
16319
16320 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16321 if (size == 8)
16322 {
16323 unspec = UNSPEC_REV64;
16324 pred_mode = VNx2BImode;
16325 }
16326 else if (size == 4)
16327 {
16328 unspec = UNSPEC_REV32;
16329 pred_mode = VNx4BImode;
16330 }
16331 else if (size == 2)
16332 {
16333 unspec = UNSPEC_REV16;
16334 pred_mode = VNx8BImode;
16335 }
16336 else
16337 return false;
16338
16339 unsigned int step = diff + 1;
16340 for (i = 0; i < step; ++i)
16341 if (!d->perm.series_p (i, step, diff - i, step))
16342 return false;
16343
16344 /* Success! */
16345 if (d->testing_p)
16346 return true;
16347
16348 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16349 if (d->vec_flags == VEC_SVE_DATA)
16350 {
16351 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16352 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16353 UNSPEC_MERGE_PTRUE);
16354 }
16355 emit_set_insn (d->target, src);
16356 return true;
16357 }
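
/* Hypothetical helper for the matcher above: REV{64,32,16} reverses the
   element order within each granule of DIFF + 1 elements, e.g. for byte
   elements and DIFF == 3 (REV32) the accepted series is
   3, 2, 1, 0, 7, 6, 5, 4, ...  */
static void
build_rev_local_indices (unsigned int *sel, unsigned int nelt,
                         unsigned int diff)
{
  unsigned int step = diff + 1;
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = (i / step) * step + (diff - i % step);
}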
16358
16359 /* Recognize patterns for the REV insn, which reverses elements within
16360 a full vector. */
16361
16362 static bool
16363 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16364 {
16365 poly_uint64 nelt = d->perm.length ();
16366
16367 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16368 return false;
16369
16370 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16371 return false;
16372
16373 /* Success! */
16374 if (d->testing_p)
16375 return true;
16376
16377 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16378 emit_set_insn (d->target, src);
16379 return true;
16380 }
16381
16382 static bool
16383 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16384 {
16385 rtx out = d->target;
16386 rtx in0;
16387 HOST_WIDE_INT elt;
16388 machine_mode vmode = d->vmode;
16389 rtx lane;
16390
16391 if (d->vec_flags == VEC_SVE_PRED
16392 || d->perm.encoding ().encoded_nelts () != 1
16393 || !d->perm[0].is_constant (&elt))
16394 return false;
16395
16396 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16397 return false;
16398
16399 /* Success! */
16400 if (d->testing_p)
16401 return true;
16402
16403 /* The generic preparation in aarch64_expand_vec_perm_const_1
16404 swaps the operand order and the permute indices if it finds
16405 d->perm[0] to be in the second operand. Thus, we can always
16406 use d->op0 and need not do any extra arithmetic to get the
16407 correct lane number. */
16408 in0 = d->op0;
16409 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16410
16411 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16412 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16413 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16414 return true;
16415 }
16416
16417 static bool
16418 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16419 {
16420 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16421 machine_mode vmode = d->vmode;
16422
16423 /* Make sure that the indices are constant. */
16424 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16425 for (unsigned int i = 0; i < encoded_nelts; ++i)
16426 if (!d->perm[i].is_constant ())
16427 return false;
16428
16429 if (d->testing_p)
16430 return true;
16431
16432 /* Generic code will try constant permutation twice: once with the
16433 original mode and again with the elements lowered to QImode.
16434 So wait and don't do the selector expansion ourselves. */
16435 if (vmode != V8QImode && vmode != V16QImode)
16436 return false;
16437
16438 /* to_constant is safe since this routine is specific to Advanced SIMD
16439 vectors. */
16440 unsigned int nelt = d->perm.length ().to_constant ();
16441 for (unsigned int i = 0; i < nelt; ++i)
16442 /* If big-endian and two vectors we end up with a weird mixed-endian
16443 mode on NEON. Reverse the index within each word but not the word
16444 itself. to_constant is safe because we checked is_constant above. */
16445 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16446 ? d->perm[i].to_constant () ^ (nelt - 1)
16447 : d->perm[i].to_constant ());
16448
16449 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16450 sel = force_reg (vmode, sel);
16451
16452 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16453 return true;
16454 }
16455
16456 /* Try to implement D using an SVE TBL instruction. */
16457
16458 static bool
16459 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16460 {
16461 unsigned HOST_WIDE_INT nelt;
16462
16463 /* Permuting two variable-length vectors could overflow the
16464 index range. */
16465 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16466 return false;
16467
16468 if (d->testing_p)
16469 return true;
16470
16471 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16472 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16473 if (d->one_vector_p)
16474 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16475 else
16476 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16477 return true;
16478 }
16479
16480 static bool
16481 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16482 {
16483 /* The pattern matching functions above are written to look for a small
16484 number to begin the sequence (0, 1, N/2). If we begin with an index
16485 from the second operand, we can swap the operands. */
16486 poly_int64 nelt = d->perm.length ();
16487 if (known_ge (d->perm[0], nelt))
16488 {
16489 d->perm.rotate_inputs (1);
16490 std::swap (d->op0, d->op1);
16491 }
16492
16493 if ((d->vec_flags == VEC_ADVSIMD
16494 || d->vec_flags == VEC_SVE_DATA
16495 || d->vec_flags == VEC_SVE_PRED)
16496 && known_gt (nelt, 1))
16497 {
16498 if (aarch64_evpc_rev_local (d))
16499 return true;
16500 else if (aarch64_evpc_rev_global (d))
16501 return true;
16502 else if (aarch64_evpc_ext (d))
16503 return true;
16504 else if (aarch64_evpc_dup (d))
16505 return true;
16506 else if (aarch64_evpc_zip (d))
16507 return true;
16508 else if (aarch64_evpc_uzp (d))
16509 return true;
16510 else if (aarch64_evpc_trn (d))
16511 return true;
16512 if (d->vec_flags == VEC_SVE_DATA)
16513 return aarch64_evpc_sve_tbl (d);
16514 else if (d->vec_flags == VEC_ADVSIMD)
16515 return aarch64_evpc_tbl (d);
16516 }
16517 return false;
16518 }
16519
16520 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16521
16522 static bool
16523 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16524 rtx op1, const vec_perm_indices &sel)
16525 {
16526 struct expand_vec_perm_d d;
16527
16528 /* Check whether the mask can be applied to a single vector. */
16529 if (sel.ninputs () == 1
16530 || (op0 && rtx_equal_p (op0, op1)))
16531 d.one_vector_p = true;
16532 else if (sel.all_from_input_p (0))
16533 {
16534 d.one_vector_p = true;
16535 op1 = op0;
16536 }
16537 else if (sel.all_from_input_p (1))
16538 {
16539 d.one_vector_p = true;
16540 op0 = op1;
16541 }
16542 else
16543 d.one_vector_p = false;
16544
16545 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16546 sel.nelts_per_input ());
16547 d.vmode = vmode;
16548 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16549 d.target = target;
16550 d.op0 = op0;
16551 d.op1 = op1;
16552 d.testing_p = !target;
16553
16554 if (!d.testing_p)
16555 return aarch64_expand_vec_perm_const_1 (&d);
16556
16557 rtx_insn *last = get_last_insn ();
16558 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16559 gcc_assert (last == get_last_insn ());
16560
16561 return ret;
16562 }
16563
16564 /* Generate a byte permute mask for a register of mode MODE,
16565 which has NUNITS units. */
16566
16567 rtx
16568 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16569 {
16570 /* We have to reverse each vector because we don't have
16571 a permuted load that can reverse-load according to ABI rules. */
16572 rtx mask;
16573 rtvec v = rtvec_alloc (16);
16574 unsigned int i, j;
16575 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16576
16577 gcc_assert (BYTES_BIG_ENDIAN);
16578 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16579
16580 for (i = 0; i < nunits; i++)
16581 for (j = 0; j < usize; j++)
16582 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16583 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16584 return force_reg (V16QImode, mask);
16585 }
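
/* Worked example (for illustration only): for V8HImode, NUNITS is 8 and the
   unit size is 2, so the mask built above is the byte sequence
   1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 - the two bytes of
   each halfword are swapped while the halfwords themselves stay in place.  */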
16586
16587 /* Return true if X is a valid second operand for the SVE instruction
16588 that implements integer comparison OP_CODE. */
16589
16590 static bool
16591 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16592 {
16593 if (register_operand (x, VOIDmode))
16594 return true;
16595
16596 switch (op_code)
16597 {
16598 case LTU:
16599 case LEU:
16600 case GEU:
16601 case GTU:
16602 return aarch64_sve_cmp_immediate_p (x, false);
16603 case LT:
16604 case LE:
16605 case GE:
16606 case GT:
16607 case NE:
16608 case EQ:
16609 return aarch64_sve_cmp_immediate_p (x, true);
16610 default:
16611 gcc_unreachable ();
16612 }
16613 }
16614
16615 /* Use predicated SVE instructions to implement the equivalent of:
16616
16617 (set TARGET OP)
16618
16619 given that PTRUE is an all-true predicate of the appropriate mode. */
16620
16621 static void
16622 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16623 {
16624 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16625 gen_rtvec (2, ptrue, op),
16626 UNSPEC_MERGE_PTRUE);
16627 rtx_insn *insn = emit_set_insn (target, unspec);
16628 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16629 }
16630
16631 /* Likewise, but also clobber the condition codes. */
16632
16633 static void
16634 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16635 {
16636 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16637 gen_rtvec (2, ptrue, op),
16638 UNSPEC_MERGE_PTRUE);
16639 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16640 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16641 }
16642
16643 /* Return the UNSPEC_COND_* code for comparison CODE. */
16644
16645 static unsigned int
16646 aarch64_unspec_cond_code (rtx_code code)
16647 {
16648 switch (code)
16649 {
16650 case NE:
16651 return UNSPEC_COND_NE;
16652 case EQ:
16653 return UNSPEC_COND_EQ;
16654 case LT:
16655 return UNSPEC_COND_LT;
16656 case GT:
16657 return UNSPEC_COND_GT;
16658 case LE:
16659 return UNSPEC_COND_LE;
16660 case GE:
16661 return UNSPEC_COND_GE;
16662 default:
16663 gcc_unreachable ();
16664 }
16665 }
16666
16667 /* Emit:
16668
16669 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16670
16671 where <X> is the operation associated with comparison CODE. This form
16672 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16673 semantics, such as when PRED might not be all-true and when comparing
16674 inactive lanes could have side effects. */
16675
16676 static void
16677 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16678 rtx pred, rtx op0, rtx op1)
16679 {
16680 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16681 gen_rtvec (3, pred, op0, op1),
16682 aarch64_unspec_cond_code (code));
16683 emit_set_insn (target, unspec);
16684 }
16685
16686 /* Expand an SVE integer comparison using the SVE equivalent of:
16687
16688 (set TARGET (CODE OP0 OP1)). */
16689
16690 void
16691 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16692 {
16693 machine_mode pred_mode = GET_MODE (target);
16694 machine_mode data_mode = GET_MODE (op0);
16695
16696 if (!aarch64_sve_cmp_operand_p (code, op1))
16697 op1 = force_reg (data_mode, op1);
16698
16699 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16700 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16701 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16702 }
16703
16704 /* Emit the SVE equivalent of:
16705
16706 (set TMP1 (CODE1 OP0 OP1))
16707 (set TMP2 (CODE2 OP0 OP1))
16708 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16709
16710 PTRUE is an all-true predicate with the same mode as TARGET. */
16711
16712 static void
16713 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16714 rtx ptrue, rtx op0, rtx op1)
16715 {
16716 machine_mode pred_mode = GET_MODE (ptrue);
16717 rtx tmp1 = gen_reg_rtx (pred_mode);
16718 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16719 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16720 rtx tmp2 = gen_reg_rtx (pred_mode);
16721 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16722 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16723 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16724 }
16725
16726 /* Emit the SVE equivalent of:
16727
16728 (set TMP (CODE OP0 OP1))
16729 (set TARGET (not TMP))
16730
16731 PTRUE is an all-true predicate with the same mode as TARGET. */
16732
16733 static void
16734 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16735 rtx op0, rtx op1)
16736 {
16737 machine_mode pred_mode = GET_MODE (ptrue);
16738 rtx tmp = gen_reg_rtx (pred_mode);
16739 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16740 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16741 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16742 }
16743
16744 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16745
16746 (set TARGET (CODE OP0 OP1))
16747
16748 If CAN_INVERT_P is true, the caller can also handle inverted results;
16749 return true if the result is in fact inverted. */
16750
16751 bool
16752 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16753 rtx op0, rtx op1, bool can_invert_p)
16754 {
16755 machine_mode pred_mode = GET_MODE (target);
16756 machine_mode data_mode = GET_MODE (op0);
16757
16758 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16759 switch (code)
16760 {
16761 case UNORDERED:
16762 /* UNORDERED has no immediate form. */
16763 op1 = force_reg (data_mode, op1);
16764 /* fall through */
16765 case LT:
16766 case LE:
16767 case GT:
16768 case GE:
16769 case EQ:
16770 case NE:
16771 {
16772 /* There is native support for the comparison. */
16773 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16774 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16775 return false;
16776 }
16777
16778 case LTGT:
16779 /* This is a trapping operation (LT or GT). */
16780 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16781 return false;
16782
16783 case UNEQ:
16784 if (!flag_trapping_math)
16785 {
16786 /* This would trap for signaling NaNs. */
16787 op1 = force_reg (data_mode, op1);
16788 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16789 return false;
16790 }
16791 /* fall through */
16792 case UNLT:
16793 case UNLE:
16794 case UNGT:
16795 case UNGE:
16796 if (flag_trapping_math)
16797 {
16798 /* Work out which elements are ordered. */
16799 rtx ordered = gen_reg_rtx (pred_mode);
16800 op1 = force_reg (data_mode, op1);
16801 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16802
16803 /* Test the opposite condition for the ordered elements,
16804 then invert the result. */
16805 if (code == UNEQ)
16806 code = NE;
16807 else
16808 code = reverse_condition_maybe_unordered (code);
16809 if (can_invert_p)
16810 {
16811 aarch64_emit_sve_predicated_cond (target, code,
16812 ordered, op0, op1);
16813 return true;
16814 }
16815 rtx tmp = gen_reg_rtx (pred_mode);
16816 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16817 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16818 return false;
16819 }
16820 break;
16821
16822 case ORDERED:
16823 /* ORDERED has no immediate form. */
16824 op1 = force_reg (data_mode, op1);
16825 break;
16826
16827 default:
16828 gcc_unreachable ();
16829 }
16830
16831 /* There is native support for the inverse comparison. */
16832 code = reverse_condition_maybe_unordered (code);
16833 if (can_invert_p)
16834 {
16835 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16836 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16837 return true;
16838 }
16839 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16840 return false;
16841 }
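
/* Scalar model (hypothetical, a single double-precision lane) of the
   trapping-math path above for UNGE: the comparison is evaluated as
   "NOT (ordered AND a < b)", i.e. the reversed condition is tested only on
   ordered lanes and the result is then inverted, so no ordering comparison
   is performed on lanes that contain a NaN.  */
static bool
model_unge_lane (double a, double b)
{
  bool ordered = !(a != a || b != b);   /* Neither operand is a NaN.  */
  bool lt = ordered && a < b;           /* Reversed condition, ordered lanes.  */
  return !lt;                           /* UNGE: unordered or a >= b.  */
}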
16842
16843 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16844 of the data being selected and CMP_MODE is the mode of the values being
16845 compared. */
16846
16847 void
16848 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16849 rtx *ops)
16850 {
16851 machine_mode pred_mode
16852 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16853 GET_MODE_SIZE (cmp_mode)).require ();
16854 rtx pred = gen_reg_rtx (pred_mode);
16855 if (FLOAT_MODE_P (cmp_mode))
16856 {
16857 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16858 ops[4], ops[5], true))
16859 std::swap (ops[1], ops[2]);
16860 }
16861 else
16862 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16863
16864 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16865 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16866 }
16867
16868 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16869 true. However, due to issues with register allocation it is preferable
16870 to avoid tying integer scalar and FP scalar modes. Executing integer
16871 operations in general registers is better than treating them as scalar
16872 vector operations. This reduces latency and avoids redundant int<->FP
16873 moves. So tie modes if they are either the same class, or vector modes
16874 with other vector modes, vector structs or any scalar mode. */
16875
16876 static bool
16877 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16878 {
16879 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16880 return true;
16881
16882 /* We specifically want to allow elements of "structure" modes to
16883 be tieable to the structure. This more general condition allows
16884 other rarer situations too. The reason we don't extend this to
16885 predicate modes is that there are no predicate structure modes
16886 nor any specific instructions for extracting part of a predicate
16887 register. */
16888 if (aarch64_vector_data_mode_p (mode1)
16889 && aarch64_vector_data_mode_p (mode2))
16890 return true;
16891
16892 /* Also allow any scalar modes with vectors. */
16893 if (aarch64_vector_mode_supported_p (mode1)
16894 || aarch64_vector_mode_supported_p (mode2))
16895 return true;
16896
16897 return false;
16898 }
16899
16900 /* Return a new RTX holding the result of moving POINTER forward by
16901 AMOUNT bytes. */
16902
16903 static rtx
16904 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16905 {
16906 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16907
16908 return adjust_automodify_address (pointer, GET_MODE (pointer),
16909 next, amount);
16910 }
16911
16912 /* Return a new RTX holding the result of moving POINTER forward by the
16913 size of the mode it points to. */
16914
16915 static rtx
16916 aarch64_progress_pointer (rtx pointer)
16917 {
16918 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16919 }
16920
16921 /* Copy one MODE sized block from SRC to DST, then advance SRC and DST by
16922 the size of MODE. */
16923
16924 static void
16925 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16926 machine_mode mode)
16927 {
16928 rtx reg = gen_reg_rtx (mode);
16929
16930 /* "Cast" the pointers to the correct mode. */
16931 *src = adjust_address (*src, mode, 0);
16932 *dst = adjust_address (*dst, mode, 0);
16933 /* Emit the memcpy. */
16934 emit_move_insn (reg, *src);
16935 emit_move_insn (*dst, reg);
16936 /* Move the pointers forward. */
16937 *src = aarch64_progress_pointer (*src);
16938 *dst = aarch64_progress_pointer (*dst);
16939 }
16940
16941 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16942 we succeed, otherwise return false. */
16943
16944 bool
16945 aarch64_expand_movmem (rtx *operands)
16946 {
16947 int n, mode_bits;
16948 rtx dst = operands[0];
16949 rtx src = operands[1];
16950 rtx base;
16951 machine_mode cur_mode = BLKmode, next_mode;
16952 bool speed_p = !optimize_function_for_size_p (cfun);
16953
16954 /* When optimizing for size, give a better estimate of the length of a
16955 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16956 will always require an even number of instructions, and each
16957 operation requires both a load and a store, so divide the max number by 2. */
16958 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16959
16960 /* We can't do anything smart if the amount to copy is not constant. */
16961 if (!CONST_INT_P (operands[2]))
16962 return false;
16963
16964 n = INTVAL (operands[2]);
16965
16966 /* Try to keep the number of instructions low. For all cases we will do at
16967 most two moves for the residual amount, since we'll always overlap the
16968 remainder. */
16969 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16970 return false;
16971
16972 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16973 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16974
16975 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16976 src = adjust_automodify_address (src, VOIDmode, base, 0);
16977
16978 /* Convert n to bits to make the rest of the code simpler. */
16979 n = n * BITS_PER_UNIT;
16980
16981 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16982 larger than TImode, but we should not use them for loads/stores here. */
16983 const int copy_limit = GET_MODE_BITSIZE (TImode);
16984
16985 while (n > 0)
16986 {
16987 /* Find the largest mode in which to do the copy without over-reading
16988 or over-writing. */
16989 opt_scalar_int_mode mode_iter;
16990 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16991 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16992 cur_mode = mode_iter.require ();
16993
16994 gcc_assert (cur_mode != BLKmode);
16995
16996 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16997 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16998
16999 n -= mode_bits;
17000
17001 /* Do certain trailing copies as overlapping if that is going to be
17002 cheaper, i.e. if it takes fewer instructions. For instance, for a 15
17003 byte copy it's more efficient to do two overlapping 8 byte copies than
17004 8 + 4 + 2 + 1. */
17005 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17006 {
17007 next_mode = smallest_mode_for_size (n, MODE_INT);
17008 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17009 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17010 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17011 n = n_bits;
17012 }
17013 }
17014
17015 return true;
17016 }
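
/* Hypothetical standalone sketch of the schedule computed above: copy the
   largest power-of-two chunk of at most 16 bytes that still fits, and handle
   a remaining tail of at most 8 bytes by rounding it up to a power of two
   and backing up so that the final copy overlaps bytes already copied.
   Returns the number of (offset, size) pairs written; for a 15-byte copy
   the result is (0, 8) and (7, 8), matching the comment in the loop.  */
static int
model_copy_schedule (int n_bytes, int *offsets, int *sizes)
{
  int pos = 0, count = 0;
  while (n_bytes > 0)
    {
      int chunk = 16;
      while (chunk > n_bytes)
        chunk >>= 1;
      offsets[count] = pos;
      sizes[count] = chunk;
      count++;
      pos += chunk;
      n_bytes -= chunk;
      if (n_bytes > 0 && n_bytes <= 8)
        {
          int rounded = 1;
          while (rounded < n_bytes)
            rounded <<= 1;
          pos -= rounded - n_bytes;     /* Overlap the tail copy.  */
          n_bytes = rounded;
        }
    }
  return count;
}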
17017
17018 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17019 SImode stores. Handle the case when the constant has identical
17020 bottom and top halves. This is beneficial when the two stores can be
17021 merged into an STP and we avoid synthesising potentially expensive
17022 immediates twice. Return true if such a split is possible. */
17023
17024 bool
17025 aarch64_split_dimode_const_store (rtx dst, rtx src)
17026 {
17027 rtx lo = gen_lowpart (SImode, src);
17028 rtx hi = gen_highpart_mode (SImode, DImode, src);
17029
17030 bool size_p = optimize_function_for_size_p (cfun);
17031
17032 if (!rtx_equal_p (lo, hi))
17033 return false;
17034
17035 unsigned int orig_cost
17036 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17037 unsigned int lo_cost
17038 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17039
17040 /* We want to transform:
17041 MOV x1, 49370
17042 MOVK x1, 0x140, lsl 16
17043 MOVK x1, 0xc0da, lsl 32
17044 MOVK x1, 0x140, lsl 48
17045 STR x1, [x0]
17046 into:
17047 MOV w1, 49370
17048 MOVK w1, 0x140, lsl 16
17049 STP w1, w1, [x0]
17050 So we want to perform this only when we save two instructions
17051 or more. When optimizing for size, however, accept any code size
17052 savings we can. */
17053 if (size_p && orig_cost <= lo_cost)
17054 return false;
17055
17056 if (!size_p
17057 && (orig_cost <= lo_cost + 1))
17058 return false;
17059
17060 rtx mem_lo = adjust_address (dst, SImode, 0);
17061 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17062 return false;
17063
17064 rtx tmp_reg = gen_reg_rtx (SImode);
17065 aarch64_expand_mov_immediate (tmp_reg, lo);
17066 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17067 /* Don't emit an explicit store pair as this may not always be profitable.
17068 Let the sched-fusion logic decide whether to merge them. */
17069 emit_move_insn (mem_lo, tmp_reg);
17070 emit_move_insn (mem_hi, tmp_reg);
17071
17072 return true;
17073 }
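
/* Hypothetical standalone check for the condition exploited above: the
   constant from the example, 0x0140c0da0140c0da, has identical 32-bit
   halves, so the DImode store can become an STP of one W register.  */
static bool
dimode_const_has_equal_halves (unsigned long long val)
{
  return (val & 0xffffffffULL) == (val >> 32);
}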
17074
17075 /* Generate RTL for a conditional branch with rtx comparison CODE in
17076 mode CC_MODE. The destination of the unlikely conditional branch
17077 is LABEL_REF. */
17078
17079 void
17080 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17081 rtx label_ref)
17082 {
17083 rtx x;
17084 x = gen_rtx_fmt_ee (code, VOIDmode,
17085 gen_rtx_REG (cc_mode, CC_REGNUM),
17086 const0_rtx);
17087
17088 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17089 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17090 pc_rtx);
17091 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17092 }
17093
17094 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17095
17096 OP1 represents the TImode destination operand 1
17097 OP2 represents the TImode destination operand 2
17098 LOW_DEST represents the low half (DImode) of TImode operand 0
17099 LOW_IN1 represents the low half (DImode) of TImode operand 1
17100 LOW_IN2 represents the low half (DImode) of TImode operand 2
17101 HIGH_DEST represents the high half (DImode) of TImode operand 0
17102 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17103 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17104
17105 void
17106 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17107 rtx *low_in1, rtx *low_in2,
17108 rtx *high_dest, rtx *high_in1,
17109 rtx *high_in2)
17110 {
17111 *low_dest = gen_reg_rtx (DImode);
17112 *low_in1 = gen_lowpart (DImode, op1);
17113 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17114 subreg_lowpart_offset (DImode, TImode));
17115 *high_dest = gen_reg_rtx (DImode);
17116 *high_in1 = gen_highpart (DImode, op1);
17117 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17118 subreg_highpart_offset (DImode, TImode));
17119 }
17120
17121 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17122
17123 This function differs from 'aarch64_addti_scratch_regs' in that
17124 OP1 can be an immediate constant (zero). We must call
17125 subreg_highpart_offset with DImode and TImode arguments, otherwise
17126 VOIDmode will be used for the const_int, which generates an internal
17127 error from subreg_size_highpart_offset, which does not expect a size of zero.
17128
17129 OP1 represents the TImode destination operand 1
17130 OP2 represents the TImode destination operand 2
17131 LOW_DEST represents the low half (DImode) of TImode operand 0
17132 LOW_IN1 represents the low half (DImode) of TImode operand 1
17133 LOW_IN2 represents the low half (DImode) of TImode operand 2
17134 HIGH_DEST represents the high half (DImode) of TImode operand 0
17135 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17136 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17137
17138
17139 void
17140 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17141 rtx *low_in1, rtx *low_in2,
17142 rtx *high_dest, rtx *high_in1,
17143 rtx *high_in2)
17144 {
17145 *low_dest = gen_reg_rtx (DImode);
17146 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17147 subreg_lowpart_offset (DImode, TImode));
17148
17149 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17150 subreg_lowpart_offset (DImode, TImode));
17151 *high_dest = gen_reg_rtx (DImode);
17152
17153 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17154 subreg_highpart_offset (DImode, TImode));
17155 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17156 subreg_highpart_offset (DImode, TImode));
17157 }
17158
17159 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17160
17161 OP0 represents the TImode destination operand 0
17162 LOW_DEST represents the low half (DImode) of TImode operand 0
17163 LOW_IN1 represents the low half (DImode) of TImode operand 1
17164 LOW_IN2 represents the low half (DImode) of TImode operand 2
17165 HIGH_DEST represents the high half (DImode) of TImode operand 0
17166 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17167 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17168 UNSIGNED_P is true if the operation is being performed on unsigned
17169 values. */
17170 void
17171 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17172 rtx low_in2, rtx high_dest, rtx high_in1,
17173 rtx high_in2, bool unsigned_p)
17174 {
17175 if (low_in2 == const0_rtx)
17176 {
17177 low_dest = low_in1;
17178 high_in2 = force_reg (DImode, high_in2);
17179 if (unsigned_p)
17180 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17181 else
17182 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17183 }
17184 else
17185 {
17186 if (CONST_INT_P (low_in2))
17187 {
17188 high_in2 = force_reg (DImode, high_in2);
17189 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17190 GEN_INT (-INTVAL (low_in2))));
17191 }
17192 else
17193 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17194
17195 if (unsigned_p)
17196 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17197 else
17198 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17199 }
17200
17201 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17202 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17203
17204 }
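
/* Scalar model (hypothetical helper) of the split performed above: the low
   64-bit halves are subtracted first and the resulting borrow is propagated
   into the subtraction of the high halves, conceptually a SUBS followed by
   a subtract-with-borrow.  Overflow detection is not modelled here.  */
static void
model_subti_split (unsigned long long lo1, unsigned long long hi1,
                   unsigned long long lo2, unsigned long long hi2,
                   unsigned long long *lo_out, unsigned long long *hi_out)
{
  unsigned long long borrow = lo1 < lo2;
  *lo_out = lo1 - lo2;
  *hi_out = hi1 - hi2 - borrow;
}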
17205
17206 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17207
17208 static unsigned HOST_WIDE_INT
17209 aarch64_asan_shadow_offset (void)
17210 {
17211 return (HOST_WIDE_INT_1 << 36);
17212 }
17213
17214 static rtx
17215 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17216 int code, tree treeop0, tree treeop1)
17217 {
17218 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17219 rtx op0, op1;
17220 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17221 insn_code icode;
17222 struct expand_operand ops[4];
17223
17224 start_sequence ();
17225 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17226
17227 op_mode = GET_MODE (op0);
17228 if (op_mode == VOIDmode)
17229 op_mode = GET_MODE (op1);
17230
17231 switch (op_mode)
17232 {
17233 case E_QImode:
17234 case E_HImode:
17235 case E_SImode:
17236 cmp_mode = SImode;
17237 icode = CODE_FOR_cmpsi;
17238 break;
17239
17240 case E_DImode:
17241 cmp_mode = DImode;
17242 icode = CODE_FOR_cmpdi;
17243 break;
17244
17245 case E_SFmode:
17246 cmp_mode = SFmode;
17247 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17248 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17249 break;
17250
17251 case E_DFmode:
17252 cmp_mode = DFmode;
17253 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17254 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17255 break;
17256
17257 default:
17258 end_sequence ();
17259 return NULL_RTX;
17260 }
17261
17262 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17263 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17264 if (!op0 || !op1)
17265 {
17266 end_sequence ();
17267 return NULL_RTX;
17268 }
17269 *prep_seq = get_insns ();
17270 end_sequence ();
17271
17272 create_fixed_operand (&ops[0], op0);
17273 create_fixed_operand (&ops[1], op1);
17274
17275 start_sequence ();
17276 if (!maybe_expand_insn (icode, 2, ops))
17277 {
17278 end_sequence ();
17279 return NULL_RTX;
17280 }
17281 *gen_seq = get_insns ();
17282 end_sequence ();
17283
17284 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17285 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17286 }
17287
17288 static rtx
17289 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17290 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17291 {
17292 rtx op0, op1, target;
17293 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17294 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17295 insn_code icode;
17296 struct expand_operand ops[6];
17297 int aarch64_cond;
17298
17299 push_to_sequence (*prep_seq);
17300 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17301
17302 op_mode = GET_MODE (op0);
17303 if (op_mode == VOIDmode)
17304 op_mode = GET_MODE (op1);
17305
17306 switch (op_mode)
17307 {
17308 case E_QImode:
17309 case E_HImode:
17310 case E_SImode:
17311 cmp_mode = SImode;
17312 icode = CODE_FOR_ccmpsi;
17313 break;
17314
17315 case E_DImode:
17316 cmp_mode = DImode;
17317 icode = CODE_FOR_ccmpdi;
17318 break;
17319
17320 case E_SFmode:
17321 cmp_mode = SFmode;
17322 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17323 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17324 break;
17325
17326 case E_DFmode:
17327 cmp_mode = DFmode;
17328 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17329 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17330 break;
17331
17332 default:
17333 end_sequence ();
17334 return NULL_RTX;
17335 }
17336
17337 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17338 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17339 if (!op0 || !op1)
17340 {
17341 end_sequence ();
17342 return NULL_RTX;
17343 }
17344 *prep_seq = get_insns ();
17345 end_sequence ();
17346
17347 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17348 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17349
17350 if (bit_code != AND)
17351 {
17352 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17353 GET_MODE (XEXP (prev, 0))),
17354 VOIDmode, XEXP (prev, 0), const0_rtx);
17355 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17356 }
17357
17358 create_fixed_operand (&ops[0], XEXP (prev, 0));
17359 create_fixed_operand (&ops[1], target);
17360 create_fixed_operand (&ops[2], op0);
17361 create_fixed_operand (&ops[3], op1);
17362 create_fixed_operand (&ops[4], prev);
17363 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17364
17365 push_to_sequence (*gen_seq);
17366 if (!maybe_expand_insn (icode, 6, ops))
17367 {
17368 end_sequence ();
17369 return NULL_RTX;
17370 }
17371
17372 *gen_seq = get_insns ();
17373 end_sequence ();
17374
17375 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17376 }
17377
17378 #undef TARGET_GEN_CCMP_FIRST
17379 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17380
17381 #undef TARGET_GEN_CCMP_NEXT
17382 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17383
17384 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
17385 instruction fusion of some sort. */
17386
17387 static bool
17388 aarch64_macro_fusion_p (void)
17389 {
17390 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17391 }
17392
17393
17394 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17395 should be kept together during scheduling. */
17396
17397 static bool
17398 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17399 {
17400 rtx set_dest;
17401 rtx prev_set = single_set (prev);
17402 rtx curr_set = single_set (curr);
17403 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17404 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17405
17406 if (!aarch64_macro_fusion_p ())
17407 return false;
17408
17409 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17410 {
17411 /* We are trying to match:
17412 prev (mov) == (set (reg r0) (const_int imm16))
17413 curr (movk) == (set (zero_extract (reg r0)
17414 (const_int 16)
17415 (const_int 16))
17416 (const_int imm16_1)) */
17417
17418 set_dest = SET_DEST (curr_set);
17419
17420 if (GET_CODE (set_dest) == ZERO_EXTRACT
17421 && CONST_INT_P (SET_SRC (curr_set))
17422 && CONST_INT_P (SET_SRC (prev_set))
17423 && CONST_INT_P (XEXP (set_dest, 2))
17424 && INTVAL (XEXP (set_dest, 2)) == 16
17425 && REG_P (XEXP (set_dest, 0))
17426 && REG_P (SET_DEST (prev_set))
17427 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17428 {
17429 return true;
17430 }
17431 }
17432
17433 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17434 {
17435
17436 /* We're trying to match:
17437 prev (adrp) == (set (reg r1)
17438 (high (symbol_ref ("SYM"))))
17439 curr (add) == (set (reg r0)
17440 (lo_sum (reg r1)
17441 (symbol_ref ("SYM"))))
17442 Note that r0 need not necessarily be the same as r1, especially
17443 during pre-regalloc scheduling. */
17444
17445 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17446 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17447 {
17448 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17449 && REG_P (XEXP (SET_SRC (curr_set), 0))
17450 && REGNO (XEXP (SET_SRC (curr_set), 0))
17451 == REGNO (SET_DEST (prev_set))
17452 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17453 XEXP (SET_SRC (curr_set), 1)))
17454 return true;
17455 }
17456 }
17457
17458 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17459 {
17460
17461 /* We're trying to match:
17462 prev (movk) == (set (zero_extract (reg r0)
17463 (const_int 16)
17464 (const_int 32))
17465 (const_int imm16_1))
17466 curr (movk) == (set (zero_extract (reg r0)
17467 (const_int 16)
17468 (const_int 48))
17469 (const_int imm16_2)) */
17470
17471 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17472 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17473 && REG_P (XEXP (SET_DEST (prev_set), 0))
17474 && REG_P (XEXP (SET_DEST (curr_set), 0))
17475 && REGNO (XEXP (SET_DEST (prev_set), 0))
17476 == REGNO (XEXP (SET_DEST (curr_set), 0))
17477 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17478 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17479 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17480 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17481 && CONST_INT_P (SET_SRC (prev_set))
17482 && CONST_INT_P (SET_SRC (curr_set)))
17483 return true;
17484
17485 }
17486 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17487 {
17488 /* We're trying to match:
17489 prev (adrp) == (set (reg r0)
17490 (high (symbol_ref ("SYM"))))
17491 curr (ldr) == (set (reg r1)
17492 (mem (lo_sum (reg r0)
17493 (symbol_ref ("SYM")))))
17494 or
17495 curr (ldr) == (set (reg r1)
17496 (zero_extend (mem
17497 (lo_sum (reg r0)
17498 (symbol_ref ("SYM")))))) */
17499 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17500 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17501 {
17502 rtx curr_src = SET_SRC (curr_set);
17503
17504 if (GET_CODE (curr_src) == ZERO_EXTEND)
17505 curr_src = XEXP (curr_src, 0);
17506
17507 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17508 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17509 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17510 == REGNO (SET_DEST (prev_set))
17511 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17512 XEXP (SET_SRC (prev_set), 0)))
17513 return true;
17514 }
17515 }
17516
17517 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17518 && aarch_crypto_can_dual_issue (prev, curr))
17519 return true;
17520
17521 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17522 && any_condjump_p (curr))
17523 {
17524 unsigned int condreg1, condreg2;
17525 rtx cc_reg_1;
17526 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17527 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17528
17529 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17530 && prev
17531 && modified_in_p (cc_reg_1, prev))
17532 {
17533 enum attr_type prev_type = get_attr_type (prev);
17534
17535 /* FIXME: this misses some instructions that ThunderX considers simple
17536 arithmetic instructions. Simple shifts are missed here. */
17537 if (prev_type == TYPE_ALUS_SREG
17538 || prev_type == TYPE_ALUS_IMM
17539 || prev_type == TYPE_LOGICS_REG
17540 || prev_type == TYPE_LOGICS_IMM)
17541 return true;
17542 }
17543 }
17544
17545 if (prev_set
17546 && curr_set
17547 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17548 && any_condjump_p (curr))
17549 {
17550 /* We're trying to match:
17551 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17552 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17553 (const_int 0))
17554 (label_ref ("SYM"))
17555 (pc)) */
17556 if (SET_DEST (curr_set) == (pc_rtx)
17557 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17558 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17559 && REG_P (SET_DEST (prev_set))
17560 && REGNO (SET_DEST (prev_set))
17561 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17562 {
17563 /* Fuse an ALU operation followed by a conditional branch instruction. */
17564 switch (get_attr_type (prev))
17565 {
17566 case TYPE_ALU_IMM:
17567 case TYPE_ALU_SREG:
17568 case TYPE_ADC_REG:
17569 case TYPE_ADC_IMM:
17570 case TYPE_ADCS_REG:
17571 case TYPE_ADCS_IMM:
17572 case TYPE_LOGIC_REG:
17573 case TYPE_LOGIC_IMM:
17574 case TYPE_CSEL:
17575 case TYPE_ADR:
17576 case TYPE_MOV_IMM:
17577 case TYPE_SHIFT_REG:
17578 case TYPE_SHIFT_IMM:
17579 case TYPE_BFM:
17580 case TYPE_RBIT:
17581 case TYPE_REV:
17582 case TYPE_EXTEND:
17583 return true;
17584
17585 default:;
17586 }
17587 }
17588 }
17589
17590 return false;
17591 }
17592
17593 /* Return true iff the instruction fusion described by OP is enabled. */
17594
17595 bool
17596 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17597 {
17598 return (aarch64_tune_params.fusible_ops & op) != 0;
17599 }
17600
17601 /* If MEM is in the form of [base+offset], extract the two parts
17602 of the address into BASE and OFFSET, otherwise return false
17603 after clearing BASE and OFFSET. */
17604
17605 bool
17606 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17607 {
17608 rtx addr;
17609
17610 gcc_assert (MEM_P (mem));
17611
17612 addr = XEXP (mem, 0);
17613
17614 if (REG_P (addr))
17615 {
17616 *base = addr;
17617 *offset = const0_rtx;
17618 return true;
17619 }
17620
17621 if (GET_CODE (addr) == PLUS
17622 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17623 {
17624 *base = XEXP (addr, 0);
17625 *offset = XEXP (addr, 1);
17626 return true;
17627 }
17628
17629 *base = NULL_RTX;
17630 *offset = NULL_RTX;
17631
17632 return false;
17633 }
17634
17635 /* Types for scheduling fusion. */
17636 enum sched_fusion_type
17637 {
17638 SCHED_FUSION_NONE = 0,
17639 SCHED_FUSION_LD_SIGN_EXTEND,
17640 SCHED_FUSION_LD_ZERO_EXTEND,
17641 SCHED_FUSION_LD,
17642 SCHED_FUSION_ST,
17643 SCHED_FUSION_NUM
17644 };
17645
17646 /* If INSN is a load or store whose address is in the form [base+offset],
17647 extract the two parts into BASE and OFFSET. Return the scheduling
17648 fusion type of this INSN. */
17649
17650 static enum sched_fusion_type
17651 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17652 {
17653 rtx x, dest, src;
17654 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17655
17656 gcc_assert (INSN_P (insn));
17657 x = PATTERN (insn);
17658 if (GET_CODE (x) != SET)
17659 return SCHED_FUSION_NONE;
17660
17661 src = SET_SRC (x);
17662 dest = SET_DEST (x);
17663
17664 machine_mode dest_mode = GET_MODE (dest);
17665
17666 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17667 return SCHED_FUSION_NONE;
17668
17669 if (GET_CODE (src) == SIGN_EXTEND)
17670 {
17671 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17672 src = XEXP (src, 0);
17673 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17674 return SCHED_FUSION_NONE;
17675 }
17676 else if (GET_CODE (src) == ZERO_EXTEND)
17677 {
17678 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17679 src = XEXP (src, 0);
17680 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17681 return SCHED_FUSION_NONE;
17682 }
17683
17684 if (GET_CODE (src) == MEM && REG_P (dest))
17685 extract_base_offset_in_addr (src, base, offset);
17686 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17687 {
17688 fusion = SCHED_FUSION_ST;
17689 extract_base_offset_in_addr (dest, base, offset);
17690 }
17691 else
17692 return SCHED_FUSION_NONE;
17693
17694 if (*base == NULL_RTX || *offset == NULL_RTX)
17695 fusion = SCHED_FUSION_NONE;
17696
17697 return fusion;
17698 }
17699
17700 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17701
17702 Currently we only support fusing ldr and str instructions, so FUSION_PRI
17703 and PRI are only calculated for these instructions. For other instructions,
17704 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17705 types of instruction fusion can be added by returning different priorities.
17706
17707 It's important that irrelevant instructions get the largest FUSION_PRI. */
17708
17709 static void
17710 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17711 int *fusion_pri, int *pri)
17712 {
17713 int tmp, off_val;
17714 rtx base, offset;
17715 enum sched_fusion_type fusion;
17716
17717 gcc_assert (INSN_P (insn));
17718
17719 tmp = max_pri - 1;
17720 fusion = fusion_load_store (insn, &base, &offset);
17721 if (fusion == SCHED_FUSION_NONE)
17722 {
17723 *pri = tmp;
17724 *fusion_pri = tmp;
17725 return;
17726 }
17727
17728 /* Set FUSION_PRI according to fusion type and base register. */
17729 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17730
17731 /* Calculate PRI. */
17732 tmp /= 2;
17733
17734 /* INSN with smaller offset goes first. */
17735 off_val = (int)(INTVAL (offset));
17736 if (off_val >= 0)
17737 tmp -= (off_val & 0xfffff);
17738 else
17739 tmp += ((- off_val) & 0xfffff);
17740
17741 *pri = tmp;
17742 return;
17743 }
17744
17745 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17746 Adjust priority of sha1h instructions so they are scheduled before
17747 other SHA1 instructions. */
17748
17749 static int
17750 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17751 {
17752 rtx x = PATTERN (insn);
17753
17754 if (GET_CODE (x) == SET)
17755 {
17756 x = SET_SRC (x);
17757
17758 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17759 return priority + 10;
17760 }
17761
17762 return priority;
17763 }
17764
17765 /* Given OPERANDS of consecutive load/store, check if we can merge
17766 them into ldp/stp. LOAD is true if they are load instructions.
17767 MODE is the mode of memory operands. */
17768
17769 bool
17770 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17771 machine_mode mode)
17772 {
17773 HOST_WIDE_INT offval_1, offval_2, msize;
17774 enum reg_class rclass_1, rclass_2;
17775 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17776
17777 if (load)
17778 {
17779 mem_1 = operands[1];
17780 mem_2 = operands[3];
17781 reg_1 = operands[0];
17782 reg_2 = operands[2];
17783 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17784 if (REGNO (reg_1) == REGNO (reg_2))
17785 return false;
17786 }
17787 else
17788 {
17789 mem_1 = operands[0];
17790 mem_2 = operands[2];
17791 reg_1 = operands[1];
17792 reg_2 = operands[3];
17793 }
17794
17795 /* The mems cannot be volatile. */
17796 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17797 return false;
17798
17799 /* If we have SImode and slow unaligned ldp,
17800 check that the alignment is at least 8 bytes. */
17801 if (mode == SImode
17802 && (aarch64_tune_params.extra_tuning_flags
17803 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17804 && !optimize_size
17805 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17806 return false;
17807
17808 /* Check if the addresses are in the form of [base+offset]. */
17809 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17810 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17811 return false;
17812 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17813 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17814 return false;
17815
17816 /* Check if the bases are the same. */
17817 if (!rtx_equal_p (base_1, base_2))
17818 return false;
17819
17820 /* The operands must be of the same size. */
17821 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17822 GET_MODE_SIZE (GET_MODE (mem_2))));
17823
17824 offval_1 = INTVAL (offset_1);
17825 offval_2 = INTVAL (offset_2);
17826 /* We should only be trying this for fixed-sized modes. There is no
17827 SVE LDP/STP instruction. */
17828 msize = GET_MODE_SIZE (mode).to_constant ();
17829 /* Check if the offsets are consecutive. */
17830 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17831 return false;
17832
17833 /* Check if the addresses are clobbered by the loads. */
17834 if (load)
17835 {
17836 if (reg_mentioned_p (reg_1, mem_1))
17837 return false;
17838
17839 /* In increasing order, the last load can clobber the address. */
17840 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17841 return false;
17842 }
17843
17844 /* One of the memory accesses must be a mempair operand.
17845 If it is not the first one, they need to be swapped by the
17846 peephole. */
17847 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17848 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17849 return false;
17850
17851 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17852 rclass_1 = FP_REGS;
17853 else
17854 rclass_1 = GENERAL_REGS;
17855
17856 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17857 rclass_2 = FP_REGS;
17858 else
17859 rclass_2 = GENERAL_REGS;
17860
17861 /* Check if the registers are of the same class. */
17862 if (rclass_1 != rclass_2)
17863 return false;
17864
17865 return true;
17866 }
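
/* For illustration (hypothetical operands rather than ones taken from a
   particular pattern): the two loads

     ldr x0, [x2, 8]
     ldr x1, [x2, 16]

   pass the checks above: the destination registers are distinct, both
   addresses use base X2, the offsets differ by exactly the mode size
   (8 bytes for DImode), both registers are GENERAL_REGS and the first
   address is a valid mem-pair operand, so the peephole may rewrite the
   pair as "ldp x0, x1, [x2, 8]".  If the second load instead used base
   X3, or an offset of 24, the function would return false.  */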
17867
17868 /* Given OPERANDS of consecutive load/store that can be merged,
17869 swap them if they are not in ascending order. */
17870 void
17871 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17872 {
17873 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17874 HOST_WIDE_INT offval_1, offval_2;
17875
17876 if (load)
17877 {
17878 mem_1 = operands[1];
17879 mem_2 = operands[3];
17880 }
17881 else
17882 {
17883 mem_1 = operands[0];
17884 mem_2 = operands[2];
17885 }
17886
17887 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17888 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17889
17890 offval_1 = INTVAL (offset_1);
17891 offval_2 = INTVAL (offset_2);
17892
17893 if (offval_1 > offval_2)
17894 {
17895 /* Irrespective of whether this is a load or a store,
17896 we do the same swap. */
17897 std::swap (operands[0], operands[2]);
17898 std::swap (operands[1], operands[3]);
17899 }
17900 }
17901
17902 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17903 comparison between the two. */
17904 int
17905 aarch64_host_wide_int_compare (const void *x, const void *y)
17906 {
17907 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17908 * ((const HOST_WIDE_INT *) y));
17909 }
17910
17911 /* Taking X and Y to be pairs of RTX operands, each pair consisting of
17912 a MEM rtx and a REG rtx, compare the address offsets of the two
17913 MEM operands.
17914
17915 Return:
17916
17917 1 iff offset (X) > offset (Y)
17918 0 iff offset (X) == offset (Y)
17919 -1 iff offset (X) < offset (Y) */
17920 int
17921 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17922 {
17923 const rtx * operands_1 = (const rtx *) x;
17924 const rtx * operands_2 = (const rtx *) y;
17925 rtx mem_1, mem_2, base, offset_1, offset_2;
17926
17927 if (MEM_P (operands_1[0]))
17928 mem_1 = operands_1[0];
17929 else
17930 mem_1 = operands_1[1];
17931
17932 if (MEM_P (operands_2[0]))
17933 mem_2 = operands_2[0];
17934 else
17935 mem_2 = operands_2[1];
17936
17937 /* Extract the offsets. */
17938 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17939 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17940
17941 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17942
17943 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17944 }
17945
17946 /* Given OPERANDS of consecutive load/store, check if we can merge
17947 them into ldp/stp by adjusting the offset. LOAD is true if they
17948 are load instructions. MODE is the mode of memory operands.
17949
17950 Given the following consecutive stores:
17951
17952 str w1, [xb, 0x100]
17953 str w1, [xb, 0x104]
17954 str w1, [xb, 0x108]
17955 str w1, [xb, 0x10c]
17956
17957 Though the offsets are out of the range supported by stp, we can
17958 still pair them after adjusting the offset, like:
17959
17960 add scratch, xb, 0x100
17961 stp w1, w1, [scratch]
17962 stp w1, w1, [scratch, 0x8]
17963
17964 The peephole patterns detecting this opportunity should guarantee
17965 the scratch register is available. */
17966
17967 bool
17968 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17969 scalar_mode mode)
17970 {
17971 const int num_insns = 4;
17972 enum reg_class rclass;
17973 HOST_WIDE_INT offvals[num_insns], msize;
17974 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17975
17976 if (load)
17977 {
17978 for (int i = 0; i < num_insns; i++)
17979 {
17980 reg[i] = operands[2 * i];
17981 mem[i] = operands[2 * i + 1];
17982
17983 gcc_assert (REG_P (reg[i]));
17984 }
17985
17986 /* Do not attempt to merge the loads if the loads clobber each other. */
17987 for (int i = 0; i < 8; i += 2)
17988 for (int j = i + 2; j < 8; j += 2)
17989 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17990 return false;
17991 }
17992 else
17993 for (int i = 0; i < num_insns; i++)
17994 {
17995 mem[i] = operands[2 * i];
17996 reg[i] = operands[2 * i + 1];
17997 }
17998
17999 /* Skip if memory operand is by itself valid for ldp/stp. */
18000 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18001 return false;
18002
18003 for (int i = 0; i < num_insns; i++)
18004 {
18005 /* The mems cannot be volatile. */
18006 if (MEM_VOLATILE_P (mem[i]))
18007 return false;
18008
18009 /* Check if the addresses are in the form of [base+offset]. */
18010 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18011 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18012 return false;
18013 }
18014
18015 /* Check if the registers are of the same class. */
18016 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18017 ? FP_REGS : GENERAL_REGS;
18018
18019 for (int i = 1; i < num_insns; i++)
18020 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18021 {
18022 if (rclass != FP_REGS)
18023 return false;
18024 }
18025 else
18026 {
18027 if (rclass != GENERAL_REGS)
18028 return false;
18029 }
18030
18031 /* Only the last register in the order in which they occur
18032 may be clobbered by the load. */
18033 if (rclass == GENERAL_REGS && load)
18034 for (int i = 0; i < num_insns - 1; i++)
18035 if (reg_mentioned_p (reg[i], mem[i]))
18036 return false;
18037
18038 /* Check if the bases are the same. */
18039 for (int i = 0; i < num_insns - 1; i++)
18040 if (!rtx_equal_p (base[i], base[i + 1]))
18041 return false;
18042
18043 for (int i = 0; i < num_insns; i++)
18044 offvals[i] = INTVAL (offset[i]);
18045
18046 msize = GET_MODE_SIZE (mode);
18047
18048 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18049 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18050 aarch64_host_wide_int_compare);
18051
18052 if (!(offvals[1] == offvals[0] + msize
18053 && offvals[3] == offvals[2] + msize))
18054 return false;
18055
18056 /* Check that the offsets are within range of each other. The ldp/stp
18057 instructions have 7-bit immediate offsets, so use 0x80. */
18058 if (offvals[2] - offvals[0] >= msize * 0x80)
18059 return false;
18060
18061 /* The offsets must be aligned with respect to each other. */
18062 if (offvals[0] % msize != offvals[2] % msize)
18063 return false;
18064
18065 /* If we have SImode and slow unaligned ldp,
18066 check that the alignment is at least 8 bytes. */
18067 if (mode == SImode
18068 && (aarch64_tune_params.extra_tuning_flags
18069 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18070 && !optimize_size
18071 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18072 return false;
18073
18074 return true;
18075 }
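
/* Relating this to the str example in the comment before this function
   (SImode, so MSIZE == 4): the offsets 0x100, 0x104, 0x108 and 0x10c
   sort into two consecutive pairs (0x104 == 0x100 + 4 and
   0x10c == 0x108 + 4), the distance between the pairs (8) is far below
   MSIZE * 0x80, both pairs share the same alignment, and 0x100 is
   outside the 7-bit scaled range of a single stp, so the function
   returns true and aarch64_gen_adjusted_ldpstp below can do the actual
   rewriting.  */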
18076
18077 /* Given OPERANDS of consecutive load/store, this function pairs them
18078 into LDP/STP after adjusting the offset. It depends on the fact
18079 that the operands can be sorted so the offsets are correct for STP.
18080 MODE is the mode of the memory operands. CODE is the rtl operator
18081 which should be applied to all memory operands; it is SIGN_EXTEND,
18082 ZERO_EXTEND or UNKNOWN. */
18083
18084 bool
18085 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18086 scalar_mode mode, RTX_CODE code)
18087 {
18088 rtx base, offset_1, offset_3, t1, t2;
18089 rtx mem_1, mem_2, mem_3, mem_4;
18090 rtx temp_operands[8];
18091 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18092 stp_off_upper_limit, stp_off_lower_limit, msize;
18093
18094 /* We make changes on a copy as we may still bail out. */
18095 for (int i = 0; i < 8; i ++)
18096 temp_operands[i] = operands[i];
18097
18098 /* Sort the operands. */
18099 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18100
18101 if (load)
18102 {
18103 mem_1 = temp_operands[1];
18104 mem_2 = temp_operands[3];
18105 mem_3 = temp_operands[5];
18106 mem_4 = temp_operands[7];
18107 }
18108 else
18109 {
18110 mem_1 = temp_operands[0];
18111 mem_2 = temp_operands[2];
18112 mem_3 = temp_operands[4];
18113 mem_4 = temp_operands[6];
18114 gcc_assert (code == UNKNOWN);
18115 }
18116
18117 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18118 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18119 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18120 && offset_3 != NULL_RTX);
18121
18122 /* Adjust the offset so it can fit in an LDP/STP instruction. */
18123 msize = GET_MODE_SIZE (mode);
18124 stp_off_upper_limit = msize * (0x40 - 1);
18125 stp_off_lower_limit = - msize * 0x40;
18126
18127 off_val_1 = INTVAL (offset_1);
18128 off_val_3 = INTVAL (offset_3);
18129
18130 /* The base offset is optimally half way between the two STP/LDP offsets. */
18131 if (msize <= 4)
18132 base_off = (off_val_1 + off_val_3) / 2;
18133 else
18134 /* However, due to issues with negative LDP/STP offset generation for
18135 larger modes (DF, DI and vector modes), we must not use negative
18136 addresses smaller than 9 signed unadjusted bits can store. This
18137 provides the most range in this case. */
18138 base_off = off_val_1;
18139
18140 /* Adjust the base so that it is aligned with the addresses but still
18141 optimal. */
18142 if (base_off % msize != off_val_1 % msize)
18143 /* Fix the offset, bearing in mind we want to make it bigger not
18144 smaller. */
18145 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18146 else if (msize <= 4)
18147 /* The negative range of LDP/STP is one larger than the positive range. */
18148 base_off += msize;
18149
18150 /* Check if base offset is too big or too small. We can attempt to resolve
18151 this issue by setting it to the maximum value and seeing if the offsets
18152 still fit. */
18153 if (base_off >= 0x1000)
18154 {
18155 base_off = 0x1000 - 1;
18156 /* We must still make sure that the base offset is aligned with respect
18157 to the address. But it may not be made any bigger. */
18158 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18159 }
18160
18161 /* Likewise for the case where the base is too small. */
18162 if (base_off <= -0x1000)
18163 {
18164 base_off = -0x1000 + 1;
18165 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18166 }
18167
18168 /* Offset of the first STP/LDP. */
18169 new_off_1 = off_val_1 - base_off;
18170
18171 /* Offset of the second STP/LDP. */
18172 new_off_3 = off_val_3 - base_off;
18173
18174 /* The offsets must be within the range of the LDP/STP instructions. */
18175 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18176 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18177 return false;
18178
18179 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18180 new_off_1), true);
18181 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18182 new_off_1 + msize), true);
18183 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18184 new_off_3), true);
18185 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18186 new_off_3 + msize), true);
18187
18188 if (!aarch64_mem_pair_operand (mem_1, mode)
18189 || !aarch64_mem_pair_operand (mem_3, mode))
18190 return false;
18191
18192 if (code == ZERO_EXTEND)
18193 {
18194 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18195 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18196 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18197 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18198 }
18199 else if (code == SIGN_EXTEND)
18200 {
18201 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18202 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18203 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18204 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18205 }
18206
18207 if (load)
18208 {
18209 operands[0] = temp_operands[0];
18210 operands[1] = mem_1;
18211 operands[2] = temp_operands[2];
18212 operands[3] = mem_2;
18213 operands[4] = temp_operands[4];
18214 operands[5] = mem_3;
18215 operands[6] = temp_operands[6];
18216 operands[7] = mem_4;
18217 }
18218 else
18219 {
18220 operands[0] = mem_1;
18221 operands[1] = temp_operands[1];
18222 operands[2] = mem_2;
18223 operands[3] = temp_operands[3];
18224 operands[4] = mem_3;
18225 operands[5] = temp_operands[5];
18226 operands[6] = mem_4;
18227 operands[7] = temp_operands[7];
18228 }
18229
18230 /* Emit adjusting instruction. */
18231 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18232 /* Emit ldp/stp instructions. */
18233 t1 = gen_rtx_SET (operands[0], operands[1]);
18234 t2 = gen_rtx_SET (operands[2], operands[3]);
18235 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18236 t1 = gen_rtx_SET (operands[4], operands[5]);
18237 t2 = gen_rtx_SET (operands[6], operands[7]);
18238 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18239 return true;
18240 }
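
/* A rough trace of the arithmetic above for the SImode example quoted
   earlier (offsets 0x100 .. 0x10c, MSIZE == 4): BASE_OFF starts at the
   midpoint (0x100 + 0x108) / 2 == 0x104, which is already suitably
   aligned, and is then bumped by MSIZE to 0x108 to make use of the
   larger negative range.  That gives NEW_OFF_1 == -8 and
   NEW_OFF_3 == 0, both within the [-256, 252] stp range for SImode,
   so the emitted sequence is roughly

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]

   (register names purely illustrative).  */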
18241
18242 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18243 it isn't worth branching around empty masked ops (including masked
18244 stores). */
18245
18246 static bool
18247 aarch64_empty_mask_is_expensive (unsigned)
18248 {
18249 return false;
18250 }
18251
18252 /* Return true if a pseudo register should be created and used to hold
18253 the GOT address for PIC code. */
18254
18255 bool
18256 aarch64_use_pseudo_pic_reg (void)
18257 {
18258 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18259 }
18260
18261 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18262
18263 static int
18264 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18265 {
18266 switch (XINT (x, 1))
18267 {
18268 case UNSPEC_GOTSMALLPIC:
18269 case UNSPEC_GOTSMALLPIC28K:
18270 case UNSPEC_GOTTINYPIC:
18271 return 0;
18272 default:
18273 break;
18274 }
18275
18276 return default_unspec_may_trap_p (x, flags);
18277 }
18278
18279
18280 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18281 return the log2 of that value. Otherwise return -1. */
18282
18283 int
18284 aarch64_fpconst_pow_of_2 (rtx x)
18285 {
18286 const REAL_VALUE_TYPE *r;
18287
18288 if (!CONST_DOUBLE_P (x))
18289 return -1;
18290
18291 r = CONST_DOUBLE_REAL_VALUE (x);
18292
18293 if (REAL_VALUE_NEGATIVE (*r)
18294 || REAL_VALUE_ISNAN (*r)
18295 || REAL_VALUE_ISINF (*r)
18296 || !real_isinteger (r, DFmode))
18297 return -1;
18298
18299 return exact_log2 (real_to_integer (r));
18300 }
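
/* For example (values chosen purely for illustration): 1.0 yields 0,
   4.0 yields 2 and 32.0 yields 5, whereas -8.0, 0.75, 6.0, infinities
   and NaNs all yield -1.  */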
18301
18302 /* If X is a vector of equal CONST_DOUBLE values and that value is
18303 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18304
18305 int
18306 aarch64_vec_fpconst_pow_of_2 (rtx x)
18307 {
18308 int nelts;
18309 if (GET_CODE (x) != CONST_VECTOR
18310 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18311 return -1;
18312
18313 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18314 return -1;
18315
18316 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18317 if (firstval <= 0)
18318 return -1;
18319
18320 for (int i = 1; i < nelts; i++)
18321 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18322 return -1;
18323
18324 return firstval;
18325 }
18326
18327 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18328 to float.
18329
18330 __fp16 always promotes through this hook.
18331 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18332 through the generic excess precision logic rather than here. */
18333
18334 static tree
18335 aarch64_promoted_type (const_tree t)
18336 {
18337 if (SCALAR_FLOAT_TYPE_P (t)
18338 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18339 return float_type_node;
18340
18341 return NULL_TREE;
18342 }
18343
18344 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18345
18346 static bool
18347 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18348 optimization_type opt_type)
18349 {
18350 switch (op)
18351 {
18352 case rsqrt_optab:
18353 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18354
18355 default:
18356 return true;
18357 }
18358 }
18359
18360 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18361
18362 static unsigned int
18363 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18364 int *offset)
18365 {
18366 /* Polynomial invariant 1 == (VG / 2) - 1. */
18367 gcc_assert (i == 1);
18368 *factor = 2;
18369 *offset = 1;
18370 return AARCH64_DWARF_VG;
18371 }
18372
18373 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18374 if MODE is HFmode, and punt to the generic implementation otherwise. */
18375
18376 static bool
18377 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18378 {
18379 return (mode == HFmode
18380 ? true
18381 : default_libgcc_floating_mode_supported_p (mode));
18382 }
18383
18384 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18385 if MODE is HFmode, and punt to the generic implementation otherwise. */
18386
18387 static bool
18388 aarch64_scalar_mode_supported_p (scalar_mode mode)
18389 {
18390 return (mode == HFmode
18391 ? true
18392 : default_scalar_mode_supported_p (mode));
18393 }
18394
18395 /* Set the value of FLT_EVAL_METHOD.
18396 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18397
18398 0: evaluate all operations and constants, whose semantic type has at
18399 most the range and precision of type float, to the range and
18400 precision of float; evaluate all other operations and constants to
18401 the range and precision of the semantic type;
18402
18403 N, where _FloatN is a supported interchange floating type
18404 evaluate all operations and constants, whose semantic type has at
18405 most the range and precision of _FloatN type, to the range and
18406 precision of the _FloatN type; evaluate all other operations and
18407 constants to the range and precision of the semantic type;
18408
18409 If we have the ARMv8.2-A extensions then we support _Float16 in native
18410 precision, so we should set this to 16. Otherwise, we support the type,
18411 but want to evaluate expressions in float precision, so set this to
18412 0. */
18413
18414 static enum flt_eval_method
18415 aarch64_excess_precision (enum excess_precision_type type)
18416 {
18417 switch (type)
18418 {
18419 case EXCESS_PRECISION_TYPE_FAST:
18420 case EXCESS_PRECISION_TYPE_STANDARD:
18421 /* We can calculate either in 16-bit range and precision or
18422 32-bit range and precision. Make that decision based on whether
18423 we have native support for the ARMv8.2-A 16-bit floating-point
18424 instructions or not. */
18425 return (TARGET_FP_F16INST
18426 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18427 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18428 case EXCESS_PRECISION_TYPE_IMPLICIT:
18429 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18430 default:
18431 gcc_unreachable ();
18432 }
18433 return FLT_EVAL_METHOD_UNPREDICTABLE;
18434 }
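
/* An illustrative consequence (hypothetical user code, not part of this
   file): given

     __fp16   a, b;
     _Float16 c, d;

   the expression a + b is always evaluated in float, because __fp16
   promotes through aarch64_promoted_type above.  c + d is evaluated in
   float when the ARMv8.2-A FP16 instructions are unavailable
   (FLT_EVAL_METHOD == 0) but stays in _Float16 precision when they are
   available (FLT_EVAL_METHOD == 16).  */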
18435
18436 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18437 scheduled for speculative execution. Reject the long-running division
18438 and square-root instructions. */
18439
18440 static bool
18441 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18442 {
18443 switch (get_attr_type (insn))
18444 {
18445 case TYPE_SDIV:
18446 case TYPE_UDIV:
18447 case TYPE_FDIVS:
18448 case TYPE_FDIVD:
18449 case TYPE_FSQRTS:
18450 case TYPE_FSQRTD:
18451 case TYPE_NEON_FP_SQRT_S:
18452 case TYPE_NEON_FP_SQRT_D:
18453 case TYPE_NEON_FP_SQRT_S_Q:
18454 case TYPE_NEON_FP_SQRT_D_Q:
18455 case TYPE_NEON_FP_DIV_S:
18456 case TYPE_NEON_FP_DIV_D:
18457 case TYPE_NEON_FP_DIV_S_Q:
18458 case TYPE_NEON_FP_DIV_D_Q:
18459 return false;
18460 default:
18461 return true;
18462 }
18463 }
18464
18465 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18466
18467 static int
18468 aarch64_compute_pressure_classes (reg_class *classes)
18469 {
18470 int i = 0;
18471 classes[i++] = GENERAL_REGS;
18472 classes[i++] = FP_REGS;
18473 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18474 registers need to go in PR_LO_REGS at some point during their
18475 lifetime. Splitting it into two halves has the effect of making
18476 all predicates count against PR_LO_REGS, so that we try whenever
18477 possible to restrict the number of live predicates to 8. This
18478 greatly reduces the amount of spilling in certain loops. */
18479 classes[i++] = PR_LO_REGS;
18480 classes[i++] = PR_HI_REGS;
18481 return i;
18482 }
18483
18484 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18485
18486 static bool
18487 aarch64_can_change_mode_class (machine_mode from,
18488 machine_mode to, reg_class_t)
18489 {
18490 if (BYTES_BIG_ENDIAN)
18491 {
18492 bool from_sve_p = aarch64_sve_data_mode_p (from);
18493 bool to_sve_p = aarch64_sve_data_mode_p (to);
18494
18495 /* Don't allow changes between SVE data modes and non-SVE modes.
18496 See the comment at the head of aarch64-sve.md for details. */
18497 if (from_sve_p != to_sve_p)
18498 return false;
18499
18500 /* Don't allow changes in element size: lane 0 of the new vector
18501 would not then be lane 0 of the old vector. See the comment
18502 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18503 description.
18504
18505 In the worst case, this forces a register to be spilled in
18506 one mode and reloaded in the other, which handles the
18507 endianness correctly. */
18508 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18509 return false;
18510 }
18511 return true;
18512 }
18513
18514 /* Implement TARGET_EARLY_REMAT_MODES. */
18515
18516 static void
18517 aarch64_select_early_remat_modes (sbitmap modes)
18518 {
18519 /* SVE values are not normally live across a call, so it should be
18520 worth doing early rematerialization even in VL-specific mode. */
18521 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18522 {
18523 machine_mode mode = (machine_mode) i;
18524 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18525 if (vec_flags & VEC_ANY_SVE)
18526 bitmap_set_bit (modes, i);
18527 }
18528 }
18529
18530 /* Override the default target speculation_safe_value. */
18531 static rtx
18532 aarch64_speculation_safe_value (machine_mode mode,
18533 rtx result, rtx val, rtx failval)
18534 {
18535 /* Maybe we should warn if falling back to hard barriers. They are
18536 likely to be noticeably more expensive than the alternative below. */
18537 if (!aarch64_track_speculation)
18538 return default_speculation_safe_value (mode, result, val, failval);
18539
18540 if (!REG_P (val))
18541 val = copy_to_mode_reg (mode, val);
18542
18543 if (!aarch64_reg_or_zero (failval, mode))
18544 failval = copy_to_mode_reg (mode, failval);
18545
18546 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18547 return result;
18548 }
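
/* A source-level sketch (hypothetical user code) of what reaches this
   hook through __builtin_speculation_safe_value:

     int
     load_checked (int *array, unsigned int idx, unsigned int bound)
     {
       if (idx < bound)
         return array[__builtin_speculation_safe_value (idx, 0)];
       return 0;
     }

   With -mtrack-speculation the copy above becomes a conditional select
   guarded by the speculation tracker (see the despeculate_copy
   patterns); otherwise we fall back to the generic hard speculation
   barrier.  */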
18549
18550 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18551 Look into the tuning structure for an estimate.
18552 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18553 Advanced SIMD 128 bits. */
18554
18555 static HOST_WIDE_INT
18556 aarch64_estimated_poly_value (poly_int64 val)
18557 {
18558 enum aarch64_sve_vector_bits_enum width_source
18559 = aarch64_tune_params.sve_width;
18560
18561 /* If we still don't have an estimate, use the default. */
18562 if (width_source == SVE_SCALABLE)
18563 return default_estimated_poly_value (val);
18564
18565 HOST_WIDE_INT over_128 = width_source - 128;
18566 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18567 }
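
/* A worked example (numbers for illustration only): if the selected
   tuning sets sve_width to SVE_256, then for an SVE byte vector whose
   size is the poly_int64 16 + 16x, OVER_128 is 256 - 128 == 128 and
   the estimate is 16 + 16 * 128 / 128 == 32 bytes, i.e. a 256-bit
   vector.  When sve_width is SVE_SCALABLE we defer to the generic
   estimate instead.  */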
18568
18569
18570 /* Return true for types that could be supported as SIMD return or
18571 argument types. */
18572
18573 static bool
18574 supported_simd_type (tree t)
18575 {
18576 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18577 {
18578 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18579 return s == 1 || s == 2 || s == 4 || s == 8;
18580 }
18581 return false;
18582 }
18583
18584 /* Return true for types that currently are supported as SIMD return
18585 or argument types. */
18586
18587 static bool
18588 currently_supported_simd_type (tree t, tree b)
18589 {
18590 if (COMPLEX_FLOAT_TYPE_P (t))
18591 return false;
18592
18593 if (TYPE_SIZE (t) != TYPE_SIZE (b))
18594 return false;
18595
18596 return supported_simd_type (t);
18597 }
18598
18599 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
18600
18601 static int
18602 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
18603 struct cgraph_simd_clone *clonei,
18604 tree base_type, int num)
18605 {
18606 tree t, ret_type, arg_type;
18607 unsigned int elt_bits, vec_bits, count;
18608
18609 if (!TARGET_SIMD)
18610 return 0;
18611
18612 if (clonei->simdlen
18613 && (clonei->simdlen < 2
18614 || clonei->simdlen > 1024
18615 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
18616 {
18617 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18618 "unsupported simdlen %d", clonei->simdlen);
18619 return 0;
18620 }
18621
18622 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
18623 if (TREE_CODE (ret_type) != VOID_TYPE
18624 && !currently_supported_simd_type (ret_type, base_type))
18625 {
18626 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
18627 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18628 "GCC does not currently support mixed size types "
18629 "for %<simd%> functions");
18630 else if (supported_simd_type (ret_type))
18631 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18632 "GCC does not currently support return type %qT "
18633 "for %<simd%> functions", ret_type);
18634 else
18635 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18636 "unsupported return type %qT for %<simd%> functions",
18637 ret_type);
18638 return 0;
18639 }
18640
18641 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
18642 {
18643 arg_type = TREE_TYPE (t);
18644
18645 if (!currently_supported_simd_type (arg_type, base_type))
18646 {
18647 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
18648 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18649 "GCC does not currently support mixed size types "
18650 "for %<simd%> functions");
18651 else
18652 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18653 "GCC does not currently support argument type %qT "
18654 "for %<simd%> functions", arg_type);
18655 return 0;
18656 }
18657 }
18658
18659 clonei->vecsize_mangle = 'n';
18660 clonei->mask_mode = VOIDmode;
18661 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
18662 if (clonei->simdlen == 0)
18663 {
18664 count = 2;
18665 vec_bits = (num == 0 ? 64 : 128);
18666 clonei->simdlen = vec_bits / elt_bits;
18667 }
18668 else
18669 {
18670 count = 1;
18671 vec_bits = clonei->simdlen * elt_bits;
18672 if (vec_bits != 64 && vec_bits != 128)
18673 {
18674 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18675 "GCC does not currently support simdlen %d for type %qT",
18676 clonei->simdlen, base_type);
18677 return 0;
18678 }
18679 }
18680 clonei->vecsize_int = vec_bits;
18681 clonei->vecsize_float = vec_bits;
18682 return count;
18683 }
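
/* For example (hypothetical declaration, not taken from this file):

     #pragma omp declare simd
     float scale (float x);

   Here BASE_TYPE is float, so ELT_BITS == 32.  With no explicit
   simdlen the hook is invoked twice and yields two Advanced SIMD
   clones, one using 64-bit vectors (simdlen 2) and one using 128-bit
   vectors (simdlen 4).  An explicit simdlen (4) clause produces just
   the single 128-bit clone, while simdlen (8) is rejected with a
   warning because 8 * 32 bits fits in neither 64 nor 128 bits.  */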
18684
18685 /* Implement TARGET_SIMD_CLONE_ADJUST. */
18686
18687 static void
18688 aarch64_simd_clone_adjust (struct cgraph_node *node)
18689 {
18690 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
18691 use the correct ABI. */
18692
18693 tree t = TREE_TYPE (node->decl);
18694 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
18695 TYPE_ATTRIBUTES (t));
18696 }
18697
18698 /* Implement TARGET_SIMD_CLONE_USABLE. */
18699
18700 static int
18701 aarch64_simd_clone_usable (struct cgraph_node *node)
18702 {
18703 switch (node->simdclone->vecsize_mangle)
18704 {
18705 case 'n':
18706 if (!TARGET_SIMD)
18707 return -1;
18708 return 0;
18709 default:
18710 gcc_unreachable ();
18711 }
18712 }
18713
18714 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
18715
18716 static int
18717 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
18718 {
18719 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
18720 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
18721 return 0;
18722 return 1;
18723 }
18724
18725 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
18726
18727 static const char *
18728 aarch64_get_multilib_abi_name (void)
18729 {
18730 if (TARGET_BIG_END)
18731 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
18732 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
18733 }
18734
18735 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
18736 global-variable-based guard, use the default; otherwise
18737 return a null tree. */
18738 static tree
18739 aarch64_stack_protect_guard (void)
18740 {
18741 if (aarch64_stack_protector_guard == SSP_GLOBAL)
18742 return default_stack_protect_guard ();
18743
18744 return NULL_TREE;
18745 }
18746
18747
18748 /* Target-specific selftests. */
18749
18750 #if CHECKING_P
18751
18752 namespace selftest {
18753
18754 /* Selftest for the RTL loader.
18755 Verify that the RTL loader copes with a dump from
18756 print_rtx_function. This is essentially just a test that class
18757 function_reader can handle a real dump, but it also verifies
18758 that lookup_reg_by_dump_name correctly handles hard regs.
18759 The presence of hard reg names in the dump means that the test is
18760 target-specific, hence it is in this file. */
18761
18762 static void
18763 aarch64_test_loading_full_dump ()
18764 {
18765 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18766
18767 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18768
18769 rtx_insn *insn_1 = get_insn_by_uid (1);
18770 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18771
18772 rtx_insn *insn_15 = get_insn_by_uid (15);
18773 ASSERT_EQ (INSN, GET_CODE (insn_15));
18774 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18775
18776 /* Verify crtl->return_rtx. */
18777 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18778 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18779 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18780 }
18781
18782 /* Run all target-specific selftests. */
18783
18784 static void
18785 aarch64_run_selftests (void)
18786 {
18787 aarch64_test_loading_full_dump ();
18788 }
18789
18790 } // namespace selftest
18791
18792 #endif /* #if CHECKING_P */
18793
18794 #undef TARGET_STACK_PROTECT_GUARD
18795 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18796
18797 #undef TARGET_ADDRESS_COST
18798 #define TARGET_ADDRESS_COST aarch64_address_cost
18799
18800 /* This hook determines whether unnamed bitfields affect the alignment
18801 of the containing structure. The hook returns true if the structure
18802 should inherit the alignment requirements of an unnamed bitfield's
18803 type. */
18804 #undef TARGET_ALIGN_ANON_BITFIELD
18805 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18806
18807 #undef TARGET_ASM_ALIGNED_DI_OP
18808 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18809
18810 #undef TARGET_ASM_ALIGNED_HI_OP
18811 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18812
18813 #undef TARGET_ASM_ALIGNED_SI_OP
18814 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18815
18816 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18817 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18818 hook_bool_const_tree_hwi_hwi_const_tree_true
18819
18820 #undef TARGET_ASM_FILE_START
18821 #define TARGET_ASM_FILE_START aarch64_start_file
18822
18823 #undef TARGET_ASM_OUTPUT_MI_THUNK
18824 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18825
18826 #undef TARGET_ASM_SELECT_RTX_SECTION
18827 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18828
18829 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18830 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18831
18832 #undef TARGET_BUILD_BUILTIN_VA_LIST
18833 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18834
18835 #undef TARGET_CALLEE_COPIES
18836 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18837
18838 #undef TARGET_CAN_ELIMINATE
18839 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18840
18841 #undef TARGET_CAN_INLINE_P
18842 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18843
18844 #undef TARGET_CANNOT_FORCE_CONST_MEM
18845 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18846
18847 #undef TARGET_CASE_VALUES_THRESHOLD
18848 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18849
18850 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18851 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18852
18853 /* Only the least significant bit is used for initialization guard
18854 variables. */
18855 #undef TARGET_CXX_GUARD_MASK_BIT
18856 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18857
18858 #undef TARGET_C_MODE_FOR_SUFFIX
18859 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18860
18861 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18862 #undef TARGET_DEFAULT_TARGET_FLAGS
18863 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18864 #endif
18865
18866 #undef TARGET_CLASS_MAX_NREGS
18867 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18868
18869 #undef TARGET_BUILTIN_DECL
18870 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18871
18872 #undef TARGET_BUILTIN_RECIPROCAL
18873 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18874
18875 #undef TARGET_C_EXCESS_PRECISION
18876 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18877
18878 #undef TARGET_EXPAND_BUILTIN
18879 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18880
18881 #undef TARGET_EXPAND_BUILTIN_VA_START
18882 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18883
18884 #undef TARGET_FOLD_BUILTIN
18885 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18886
18887 #undef TARGET_FUNCTION_ARG
18888 #define TARGET_FUNCTION_ARG aarch64_function_arg
18889
18890 #undef TARGET_FUNCTION_ARG_ADVANCE
18891 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18892
18893 #undef TARGET_FUNCTION_ARG_BOUNDARY
18894 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18895
18896 #undef TARGET_FUNCTION_ARG_PADDING
18897 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18898
18899 #undef TARGET_GET_RAW_RESULT_MODE
18900 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18901 #undef TARGET_GET_RAW_ARG_MODE
18902 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18903
18904 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18905 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18906
18907 #undef TARGET_FUNCTION_VALUE
18908 #define TARGET_FUNCTION_VALUE aarch64_function_value
18909
18910 #undef TARGET_FUNCTION_VALUE_REGNO_P
18911 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18912
18913 #undef TARGET_GIMPLE_FOLD_BUILTIN
18914 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18915
18916 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18917 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18918
18919 #undef TARGET_INIT_BUILTINS
18920 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18921
18922 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18923 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18924 aarch64_ira_change_pseudo_allocno_class
18925
18926 #undef TARGET_LEGITIMATE_ADDRESS_P
18927 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18928
18929 #undef TARGET_LEGITIMATE_CONSTANT_P
18930 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18931
18932 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18933 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18934 aarch64_legitimize_address_displacement
18935
18936 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18937 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18938
18939 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18940 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18941 aarch64_libgcc_floating_mode_supported_p
18942
18943 #undef TARGET_MANGLE_TYPE
18944 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18945
18946 #undef TARGET_MEMORY_MOVE_COST
18947 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18948
18949 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18950 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18951
18952 #undef TARGET_MUST_PASS_IN_STACK
18953 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18954
18955 /* This target hook should return true if accesses to volatile bitfields
18956 should use the narrowest mode possible. It should return false if these
18957 accesses should use the bitfield container type. */
18958 #undef TARGET_NARROW_VOLATILE_BITFIELD
18959 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18960
18961 #undef TARGET_OPTION_OVERRIDE
18962 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18963
18964 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18965 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18966 aarch64_override_options_after_change
18967
18968 #undef TARGET_OPTION_SAVE
18969 #define TARGET_OPTION_SAVE aarch64_option_save
18970
18971 #undef TARGET_OPTION_RESTORE
18972 #define TARGET_OPTION_RESTORE aarch64_option_restore
18973
18974 #undef TARGET_OPTION_PRINT
18975 #define TARGET_OPTION_PRINT aarch64_option_print
18976
18977 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18978 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18979
18980 #undef TARGET_SET_CURRENT_FUNCTION
18981 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18982
18983 #undef TARGET_PASS_BY_REFERENCE
18984 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18985
18986 #undef TARGET_PREFERRED_RELOAD_CLASS
18987 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18988
18989 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18990 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18991
18992 #undef TARGET_PROMOTED_TYPE
18993 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18994
18995 #undef TARGET_SECONDARY_RELOAD
18996 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18997
18998 #undef TARGET_SHIFT_TRUNCATION_MASK
18999 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19000
19001 #undef TARGET_SETUP_INCOMING_VARARGS
19002 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19003
19004 #undef TARGET_STRUCT_VALUE_RTX
19005 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19006
19007 #undef TARGET_REGISTER_MOVE_COST
19008 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19009
19010 #undef TARGET_RETURN_IN_MEMORY
19011 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19012
19013 #undef TARGET_RETURN_IN_MSB
19014 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19015
19016 #undef TARGET_RTX_COSTS
19017 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19018
19019 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19020 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19021
19022 #undef TARGET_SCHED_ISSUE_RATE
19023 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19024
19025 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19026 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19027 aarch64_sched_first_cycle_multipass_dfa_lookahead
19028
19029 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19030 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19031 aarch64_first_cycle_multipass_dfa_lookahead_guard
19032
19033 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19034 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19035 aarch64_get_separate_components
19036
19037 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19038 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19039 aarch64_components_for_bb
19040
19041 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19042 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19043 aarch64_disqualify_components
19044
19045 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19046 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19047 aarch64_emit_prologue_components
19048
19049 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19050 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19051 aarch64_emit_epilogue_components
19052
19053 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19054 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19055 aarch64_set_handled_components
19056
19057 #undef TARGET_TRAMPOLINE_INIT
19058 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19059
19060 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19061 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19062
19063 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19064 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19065
19066 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19067 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19068 aarch64_builtin_support_vector_misalignment
19069
19070 #undef TARGET_ARRAY_MODE
19071 #define TARGET_ARRAY_MODE aarch64_array_mode
19072
19073 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19074 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19075
19076 #undef TARGET_VECTORIZE_ADD_STMT_COST
19077 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19078
19079 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19080 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19081 aarch64_builtin_vectorization_cost
19082
19083 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19084 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19085
19086 #undef TARGET_VECTORIZE_BUILTINS
19087 #define TARGET_VECTORIZE_BUILTINS
19088
19089 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19090 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19091 aarch64_builtin_vectorized_function
19092
19093 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19094 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19095 aarch64_autovectorize_vector_sizes
19096
19097 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19098 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19099 aarch64_atomic_assign_expand_fenv
19100
19101 /* Section anchor support. */
19102
19103 #undef TARGET_MIN_ANCHOR_OFFSET
19104 #define TARGET_MIN_ANCHOR_OFFSET -256
19105
19106 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19107 byte offset; we can do much more for larger data types, but have no way
19108 to determine the size of the access. We assume accesses are aligned. */
19109 #undef TARGET_MAX_ANCHOR_OFFSET
19110 #define TARGET_MAX_ANCHOR_OFFSET 4095
19111
19112 #undef TARGET_VECTOR_ALIGNMENT
19113 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19114
19115 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19116 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19117 aarch64_vectorize_preferred_vector_alignment
19118 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19119 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19120 aarch64_simd_vector_alignment_reachable
19121
19122 /* vec_perm support. */
19123
19124 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19125 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19126 aarch64_vectorize_vec_perm_const
19127
19128 #undef TARGET_VECTORIZE_GET_MASK_MODE
19129 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19130 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19131 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19132 aarch64_empty_mask_is_expensive
19133 #undef TARGET_PREFERRED_ELSE_VALUE
19134 #define TARGET_PREFERRED_ELSE_VALUE \
19135 aarch64_preferred_else_value
19136
19137 #undef TARGET_INIT_LIBFUNCS
19138 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19139
19140 #undef TARGET_FIXED_CONDITION_CODE_REGS
19141 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19142
19143 #undef TARGET_FLAGS_REGNUM
19144 #define TARGET_FLAGS_REGNUM CC_REGNUM
19145
19146 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19147 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19148
19149 #undef TARGET_ASAN_SHADOW_OFFSET
19150 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19151
19152 #undef TARGET_LEGITIMIZE_ADDRESS
19153 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19154
19155 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19156 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19157
19158 #undef TARGET_CAN_USE_DOLOOP_P
19159 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19160
19161 #undef TARGET_SCHED_ADJUST_PRIORITY
19162 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19163
19164 #undef TARGET_SCHED_MACRO_FUSION_P
19165 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19166
19167 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19168 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19169
19170 #undef TARGET_SCHED_FUSION_PRIORITY
19171 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19172
19173 #undef TARGET_UNSPEC_MAY_TRAP_P
19174 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19175
19176 #undef TARGET_USE_PSEUDO_PIC_REG
19177 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19178
19179 #undef TARGET_PRINT_OPERAND
19180 #define TARGET_PRINT_OPERAND aarch64_print_operand
19181
19182 #undef TARGET_PRINT_OPERAND_ADDRESS
19183 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19184
19185 #undef TARGET_OPTAB_SUPPORTED_P
19186 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19187
19188 #undef TARGET_OMIT_STRUCT_RETURN_REG
19189 #define TARGET_OMIT_STRUCT_RETURN_REG true
19190
19191 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19192 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19193 aarch64_dwarf_poly_indeterminate_value
19194
19195 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19196 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19197 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19198
19199 #undef TARGET_HARD_REGNO_NREGS
19200 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19201 #undef TARGET_HARD_REGNO_MODE_OK
19202 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19203
19204 #undef TARGET_MODES_TIEABLE_P
19205 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19206
19207 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19208 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19209 aarch64_hard_regno_call_part_clobbered
19210
19211 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19212 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19213 aarch64_remove_extra_call_preserved_regs
19214
19215 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19216 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19217 aarch64_return_call_with_max_clobbers
19218
19219 #undef TARGET_CONSTANT_ALIGNMENT
19220 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19221
19222 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19223 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19224 aarch64_stack_clash_protection_alloca_probe_range
19225
19226 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19227 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19228
19229 #undef TARGET_CAN_CHANGE_MODE_CLASS
19230 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19231
19232 #undef TARGET_SELECT_EARLY_REMAT_MODES
19233 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19234
19235 #undef TARGET_SPECULATION_SAFE_VALUE
19236 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19237
19238 #undef TARGET_ESTIMATED_POLY_VALUE
19239 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19240
19241 #undef TARGET_ATTRIBUTE_TABLE
19242 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19243
19244 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19245 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19246 aarch64_simd_clone_compute_vecsize_and_simdlen
19247
19248 #undef TARGET_SIMD_CLONE_ADJUST
19249 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19250
19251 #undef TARGET_SIMD_CLONE_USABLE
19252 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19253
19254 #undef TARGET_COMP_TYPE_ATTRIBUTES
19255 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
19256
19257 #undef TARGET_GET_MULTILIB_ABI_NAME
19258 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
19259
19260 #if CHECKING_P
19261 #undef TARGET_RUN_TARGET_SELFTESTS
19262 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19263 #endif /* #if CHECKING_P */
19264
19265 struct gcc_target targetm = TARGET_INITIALIZER;
19266
19267 #include "gt-aarch64.h"