1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
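/* For example (illustrative): a V4SI constant whose elements are all
   0x00ff0000 can be described as (SImode, 0xff, MOV, LSL, 16), i.e. a
   MOVI with a shifted immediate, while a series constant such as
   { 1, 3, 5, ... } is described by value 1 and step 2.  */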
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 unsigned long aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Generic costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* Generic costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* Costs for vector insn classes for Vulcan. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params ares_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const unsigned long flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 /* The current tuning set. */
1176 struct tune_params aarch64_tune_params = generic_tunings;
1177
1178 /* Table of machine attributes. */
1179 static const struct attribute_spec aarch64_attribute_table[] =
1180 {
1181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1182 affects_type_identity, handler, exclude } */
1183 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL },
1184 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1185 };
1186
1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1188
1189 /* An ISA extension in the co-processor and main instruction set space. */
1190 struct aarch64_option_extension
1191 {
1192 const char *const name;
1193 const unsigned long flags_on;
1194 const unsigned long flags_off;
1195 };
1196
1197 typedef enum aarch64_cond_code
1198 {
1199 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1200 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1201 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1202 }
1203 aarch64_cc;
1204
1205 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
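/* The condition codes above are laid out in complementary pairs (EQ/NE,
   CS/CC, MI/PL, ...), so flipping the low bit yields the inverse condition:
   for example AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT.  */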
1206
1207 struct aarch64_branch_protect_type
1208 {
1209 /* The type's name that the user passes to the branch-protection option
1210 string. */
1211 const char* name;
1212 /* Function to handle the protection type and set global variables.
1213 First argument is the string token corresponding with this type and the
1214 second argument is the next token in the option string.
1215 Return values:
1216 * AARCH64_PARSE_OK: Handling was successful.
1217 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1218 should print an error.
1219 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1220 own error. */
1221 enum aarch64_parse_opt_result (*handler)(char*, char*);
1222 /* A list of types that can follow this type in the option string. */
1223 const aarch64_branch_protect_type* subtypes;
1224 unsigned int num_subtypes;
1225 };
1226
1227 static enum aarch64_parse_opt_result
1228 aarch64_handle_no_branch_protection (char* str, char* rest)
1229 {
1230 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1231 aarch64_enable_bti = 0;
1232 if (rest)
1233 {
1234 error ("unexpected %<%s%> after %<%s%>", rest, str);
1235 return AARCH64_PARSE_INVALID_FEATURE;
1236 }
1237 return AARCH64_PARSE_OK;
1238 }
1239
1240 static enum aarch64_parse_opt_result
1241 aarch64_handle_standard_branch_protection (char* str, char* rest)
1242 {
1243 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1244 aarch64_enable_bti = 1;
1245 if (rest)
1246 {
1247 error ("unexpected %<%s%> after %<%s%>", rest, str);
1248 return AARCH64_PARSE_INVALID_FEATURE;
1249 }
1250 return AARCH64_PARSE_OK;
1251 }
1252
1253 static enum aarch64_parse_opt_result
1254 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1255 char* rest ATTRIBUTE_UNUSED)
1256 {
1257 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1263 char* rest ATTRIBUTE_UNUSED)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1266 return AARCH64_PARSE_OK;
1267 }
1268
1269 static enum aarch64_parse_opt_result
1270 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1271 char* rest ATTRIBUTE_UNUSED)
1272 {
1273 aarch64_enable_bti = 1;
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1278 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1279 { NULL, NULL, NULL, 0 }
1280 };
1281
1282 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1283 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1284 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1285 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1286 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1287 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1288 { NULL, NULL, NULL, 0 }
1289 };
1290
1291 /* The assembler names of the condition codes of the processor. */
1292 static const char * const aarch64_condition_codes[] =
1293 {
1294 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1295 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1296 };
1297
1298 /* Generate code to enable conditional branches in functions over 1 MiB. */
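/* A sketch of the emitted sequence (callers are expected to pass
   BRANCH_FORMAT with the condition already inverted, so that the short
   branch skips over the far jump):

       <inverted branch> .Ltmp
       b                 <original destination>
     .Ltmp:
*/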
1299 const char *
1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1301 const char * branch_format)
1302 {
1303 rtx_code_label * tmp_label = gen_label_rtx ();
1304 char label_buf[256];
1305 char buffer[128];
1306 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1307 CODE_LABEL_NUMBER (tmp_label));
1308 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1309 rtx dest_label = operands[pos_label];
1310 operands[pos_label] = tmp_label;
1311
1312 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1313 output_asm_insn (buffer, operands);
1314
1315 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1316 operands[pos_label] = dest_label;
1317 output_asm_insn (buffer, operands);
1318 return "";
1319 }
1320
1321 void
1322 aarch64_err_no_fpadvsimd (machine_mode mode)
1323 {
1324 if (TARGET_GENERAL_REGS_ONLY)
1325 if (FLOAT_MODE_P (mode))
1326 error ("%qs is incompatible with the use of floating-point types",
1327 "-mgeneral-regs-only");
1328 else
1329 error ("%qs is incompatible with the use of vector types",
1330 "-mgeneral-regs-only");
1331 else
1332 if (FLOAT_MODE_P (mode))
1333 error ("%qs feature modifier is incompatible with the use of"
1334 " floating-point types", "+nofp");
1335 else
1336 error ("%qs feature modifier is incompatible with the use of"
1337 " vector types", "+nofp");
1338 }
1339
1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1341 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1342 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1343 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1344 and GENERAL_REGS is lower than the memory cost (in this case the best class
1345 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1346 cost results in bad allocations with many redundant int<->FP moves which
1347 are expensive on various cores.
1348 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1349 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1350 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1351 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1352 The result of this is that it is no longer inefficient to have a higher
1353 memory move cost than the register move cost.
1354 */
1355
1356 static reg_class_t
1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1358 reg_class_t best_class)
1359 {
1360 machine_mode mode;
1361
1362 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1363 || !reg_class_subset_p (FP_REGS, allocno_class))
1364 return allocno_class;
1365
1366 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1367 || !reg_class_subset_p (FP_REGS, best_class))
1368 return best_class;
1369
1370 mode = PSEUDO_REGNO_MODE (regno);
1371 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1372 }
1373
1374 static unsigned int
1375 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1376 {
1377 if (GET_MODE_UNIT_SIZE (mode) == 4)
1378 return aarch64_tune_params.min_div_recip_mul_sf;
1379 return aarch64_tune_params.min_div_recip_mul_df;
1380 }
1381
1382 /* Return the reassociation width of treeop OPC with mode MODE. */
1383 static int
1384 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1385 {
1386 if (VECTOR_MODE_P (mode))
1387 return aarch64_tune_params.vec_reassoc_width;
1388 if (INTEGRAL_MODE_P (mode))
1389 return aarch64_tune_params.int_reassoc_width;
1390 /* Avoid reassociating floating point addition so we emit more FMAs. */
1391 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1392 return aarch64_tune_params.fp_reassoc_width;
1393 return 1;
1394 }
1395
1396 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1397 unsigned
1398 aarch64_dbx_register_number (unsigned regno)
1399 {
1400 if (GP_REGNUM_P (regno))
1401 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1402 else if (regno == SP_REGNUM)
1403 return AARCH64_DWARF_SP;
1404 else if (FP_REGNUM_P (regno))
1405 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1406 else if (PR_REGNUM_P (regno))
1407 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1408 else if (regno == VG_REGNUM)
1409 return AARCH64_DWARF_VG;
1410
1411 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1412 equivalent DWARF register. */
1413 return DWARF_FRAME_REGISTERS;
1414 }
1415
1416 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1417 static bool
1418 aarch64_advsimd_struct_mode_p (machine_mode mode)
1419 {
1420 return (TARGET_SIMD
1421 && (mode == OImode || mode == CImode || mode == XImode));
1422 }
1423
1424 /* Return true if MODE is an SVE predicate mode. */
1425 static bool
1426 aarch64_sve_pred_mode_p (machine_mode mode)
1427 {
1428 return (TARGET_SVE
1429 && (mode == VNx16BImode
1430 || mode == VNx8BImode
1431 || mode == VNx4BImode
1432 || mode == VNx2BImode));
1433 }
1434
1435 /* Three mutually-exclusive flags describing a vector or predicate type. */
1436 const unsigned int VEC_ADVSIMD = 1;
1437 const unsigned int VEC_SVE_DATA = 2;
1438 const unsigned int VEC_SVE_PRED = 4;
1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1440 a structure of 2, 3 or 4 vectors. */
1441 const unsigned int VEC_STRUCT = 8;
1442 /* Useful combinations of the above. */
1443 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
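/* For example, V4SI classifies below as VEC_ADVSIMD, VNx4SI as VEC_SVE_DATA,
   and an SVE tuple of 2, 3 or 4 vectors as VEC_SVE_DATA | VEC_STRUCT
   (assuming the corresponding target feature is enabled).  */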
1445
1446 /* Return a set of flags describing the vector properties of mode MODE.
1447 Ignore modes that are not supported by the current target. */
1448 static unsigned int
1449 aarch64_classify_vector_mode (machine_mode mode)
1450 {
1451 if (aarch64_advsimd_struct_mode_p (mode))
1452 return VEC_ADVSIMD | VEC_STRUCT;
1453
1454 if (aarch64_sve_pred_mode_p (mode))
1455 return VEC_SVE_PRED;
1456
1457 scalar_mode inner = GET_MODE_INNER (mode);
1458 if (VECTOR_MODE_P (mode)
1459 && (inner == QImode
1460 || inner == HImode
1461 || inner == HFmode
1462 || inner == SImode
1463 || inner == SFmode
1464 || inner == DImode
1465 || inner == DFmode))
1466 {
1467 if (TARGET_SVE)
1468 {
1469 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1470 return VEC_SVE_DATA;
1471 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1472 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1473 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1474 return VEC_SVE_DATA | VEC_STRUCT;
1475 }
1476
1477 /* This includes V1DF but not V1DI (which doesn't exist). */
1478 if (TARGET_SIMD
1479 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1480 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1481 return VEC_ADVSIMD;
1482 }
1483
1484 return 0;
1485 }
1486
1487 /* Return true if MODE is any of the data vector modes, including
1488 structure modes. */
1489 static bool
1490 aarch64_vector_data_mode_p (machine_mode mode)
1491 {
1492 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1493 }
1494
1495 /* Return true if MODE is an SVE data vector mode; either a single vector
1496 or a structure of vectors. */
1497 static bool
1498 aarch64_sve_data_mode_p (machine_mode mode)
1499 {
1500 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1501 }
1502
1503 /* Implement target hook TARGET_ARRAY_MODE. */
1504 static opt_machine_mode
1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1506 {
1507 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1508 && IN_RANGE (nelems, 2, 4))
1509 return mode_for_vector (GET_MODE_INNER (mode),
1510 GET_MODE_NUNITS (mode) * nelems);
1511
1512 return opt_machine_mode ();
1513 }
1514
1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1516 static bool
1517 aarch64_array_mode_supported_p (machine_mode mode,
1518 unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (TARGET_SIMD
1521 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1522 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1523 && (nelems >= 2 && nelems <= 4))
1524 return true;
1525
1526 return false;
1527 }
1528
1529 /* Return the SVE predicate mode to use for elements that have
1530 ELEM_NBYTES bytes, if such a mode exists. */
1531
1532 opt_machine_mode
1533 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1534 {
1535 if (TARGET_SVE)
1536 {
1537 if (elem_nbytes == 1)
1538 return VNx16BImode;
1539 if (elem_nbytes == 2)
1540 return VNx8BImode;
1541 if (elem_nbytes == 4)
1542 return VNx4BImode;
1543 if (elem_nbytes == 8)
1544 return VNx2BImode;
1545 }
1546 return opt_machine_mode ();
1547 }
1548
1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1550
1551 static opt_machine_mode
1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1553 {
1554 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1555 {
1556 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1557 machine_mode pred_mode;
1558 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1559 return pred_mode;
1560 }
1561
1562 return default_get_mask_mode (nunits, nbytes);
1563 }
1564
1565 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1566 prefer to use the first arithmetic operand as the else value if
1567 the else value doesn't matter, since that exactly matches the SVE
1568 destructive merging form. For ternary operations we could either
1569 pick the first operand and use FMAD-like instructions or the last
1570 operand and use FMLA-like instructions; the latter seems more
1571 natural. */
1572
1573 static tree
1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1575 {
1576 return nops == 3 ? ops[2] : ops[0];
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_NREGS. */
1580
1581 static unsigned int
1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1583 {
1584 /* ??? Logically we should only need to provide a value when
1585 HARD_REGNO_MODE_OK says that the combination is valid,
1586 but at the moment we need to handle all modes. Just ignore
1587 any runtime parts for registers that can't store them. */
1588 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1589 switch (aarch64_regno_regclass (regno))
1590 {
1591 case FP_REGS:
1592 case FP_LO_REGS:
1593 if (aarch64_sve_data_mode_p (mode))
1594 return exact_div (GET_MODE_SIZE (mode),
1595 BYTES_PER_SVE_VECTOR).to_constant ();
1596 return CEIL (lowest_size, UNITS_PER_VREG);
1597 case PR_REGS:
1598 case PR_LO_REGS:
1599 case PR_HI_REGS:
1600 return 1;
1601 default:
1602 return CEIL (lowest_size, UNITS_PER_WORD);
1603 }
1604 gcc_unreachable ();
1605 }
1606
1607 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1608
1609 static bool
1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1611 {
1612 if (GET_MODE_CLASS (mode) == MODE_CC)
1613 return regno == CC_REGNUM;
1614
1615 if (regno == VG_REGNUM)
1616 /* This must have the same size as _Unwind_Word. */
1617 return mode == DImode;
1618
1619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1620 if (vec_flags & VEC_SVE_PRED)
1621 return PR_REGNUM_P (regno);
1622
1623 if (PR_REGNUM_P (regno))
1624 return 0;
1625
1626 if (regno == SP_REGNUM)
1627 /* The purpose of comparing with ptr_mode is to support the
1628 global register variable associated with the stack pointer
1629 register via the syntax of asm ("wsp") in ILP32. */
1630 return mode == Pmode || mode == ptr_mode;
1631
1632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1633 return mode == Pmode;
1634
1635 if (GP_REGNUM_P (regno))
1636 {
1637 if (known_le (GET_MODE_SIZE (mode), 8))
1638 return true;
1639 else if (known_le (GET_MODE_SIZE (mode), 16))
1640 return (regno & 1) == 0;
1641 }
1642 else if (FP_REGNUM_P (regno))
1643 {
1644 if (vec_flags & VEC_STRUCT)
1645 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1646 else
1647 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1648 }
1649
1650 return false;
1651 }
1652
1653 /* Return true if this is a definition of a vectorized simd function. */
1654
1655 static bool
1656 aarch64_simd_decl_p (tree fndecl)
1657 {
1658 tree fntype;
1659
1660 if (fndecl == NULL)
1661 return false;
1662 fntype = TREE_TYPE (fndecl);
1663 if (fntype == NULL)
1664 return false;
1665
1666 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1667 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1668 return true;
1669
1670 return false;
1671 }
1672
1673 /* Return the mode a register save/restore should use. DImode for integer
1674 registers, DFmode for FP registers in non-SIMD functions (they only save
1675 the bottom half of a 128 bit register), or TFmode for FP registers in
1676 SIMD functions. */
1677
1678 static machine_mode
1679 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1680 {
1681 return GP_REGNUM_P (regno)
1682 ? E_DImode
1683 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1684 }
1685
1686 /* Return true if the instruction is a call to a SIMD function, false
1687 if it is not a SIMD function or if we do not know anything about
1688 the function. */
1689
1690 static bool
1691 aarch64_simd_call_p (rtx_insn *insn)
1692 {
1693 rtx symbol;
1694 rtx call;
1695 tree fndecl;
1696
1697 gcc_assert (CALL_P (insn));
1698 call = get_call_rtx_from (insn);
1699 symbol = XEXP (XEXP (call, 0), 0);
1700 if (GET_CODE (symbol) != SYMBOL_REF)
1701 return false;
1702 fndecl = SYMBOL_REF_DECL (symbol);
1703 if (!fndecl)
1704 return false;
1705
1706 return aarch64_simd_decl_p (fndecl);
1707 }
1708
1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1710 a function that uses the SIMD ABI, take advantage of the extra
1711 call-preserved registers that the ABI provides. */
1712
1713 void
1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1715 HARD_REG_SET *return_set)
1716 {
1717 if (aarch64_simd_call_p (insn))
1718 {
1719 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1720 if (FP_SIMD_SAVED_REGNUM_P (regno))
1721 CLEAR_HARD_REG_BIT (*return_set, regno);
1722 }
1723 }
1724
1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1726 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1727 clobbers the top 64 bits when restoring the bottom 64 bits. */
1728
1729 static bool
1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1731 machine_mode mode)
1732 {
1733 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1734 return FP_REGNUM_P (regno)
1735 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1736 }
1737
1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1739
1740 rtx_insn *
1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1742 {
1743 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1744
1745 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1746 return call_1;
1747 else
1748 return call_2;
1749 }
1750
1751 /* Implement REGMODE_NATURAL_SIZE. */
1752 poly_uint64
1753 aarch64_regmode_natural_size (machine_mode mode)
1754 {
1755 /* The natural size for SVE data modes is one SVE data vector,
1756 and similarly for predicates. We can't independently modify
1757 anything smaller than that. */
1758 /* ??? For now, only do this for variable-width SVE registers.
1759 Doing it for constant-sized registers breaks lower-subreg.c. */
1760 /* ??? And once that's fixed, we should probably have similar
1761 code for Advanced SIMD. */
1762 if (!aarch64_sve_vg.is_constant ())
1763 {
1764 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1765 if (vec_flags & VEC_SVE_PRED)
1766 return BYTES_PER_SVE_PRED;
1767 if (vec_flags & VEC_SVE_DATA)
1768 return BYTES_PER_SVE_VECTOR;
1769 }
1770 return UNITS_PER_WORD;
1771 }
1772
1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1774 machine_mode
1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1776 machine_mode mode)
1777 {
1778 /* The predicate mode determines which bits are significant and
1779 which are "don't care". Decreasing the number of lanes would
1780 lose data while increasing the number of lanes would make bits
1781 unnecessarily significant. */
1782 if (PR_REGNUM_P (regno))
1783 return mode;
1784 if (known_ge (GET_MODE_SIZE (mode), 4))
1785 return mode;
1786 else
1787 return SImode;
1788 }
1789
1790 /* Return true if I's bits are consecutive ones from the MSB. */
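/* For example, 0xffffffffffff0000 negates to 0x10000 (a power of two) and so
   satisfies the test below, whereas 0 and 0xff00ff00 do not.  */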
1791 bool
1792 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1793 {
1794 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1795 }
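
/* Illustrative examples (not part of the original sources) for the
   predicate above:
     i = 0xfffffffffffff000  ->  -i = 0x1000, exact_log2 = 12,  true
     i = 0xffffffffffffffff  ->  -i = 0x1,    exact_log2 = 0,   true
     i = 0x00000000000000f0  ->  -i is not a power of two,      false.  */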
1796
1797 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1798 that strcpy from constants will be faster. */
1799
1800 static HOST_WIDE_INT
1801 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1802 {
1803 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1804 return MAX (align, BITS_PER_WORD);
1805 return align;
1806 }
1807
1808 /* Return true if calls to DECL should be treated as
1809 long-calls (i.e. called via a register). */
1810 static bool
1811 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1812 {
1813 return false;
1814 }
1815
1816 /* Return true if calls to symbol-ref SYM should be treated as
1817 long-calls (i.e. called via a register). */
1818 bool
1819 aarch64_is_long_call_p (rtx sym)
1820 {
1821 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1822 }
1823
1824 /* Return true if calls to symbol-ref SYM should not go through
1825 plt stubs. */
1826
1827 bool
1828 aarch64_is_noplt_call_p (rtx sym)
1829 {
1830 const_tree decl = SYMBOL_REF_DECL (sym);
1831
1832 if (flag_pic
1833 && decl
1834 && (!flag_plt
1835 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1836 && !targetm.binds_local_p (decl))
1837 return true;
1838
1839 return false;
1840 }
1841
1842 /* Return true if the offsets to a zero/sign-extract operation
1843 represent an expression that matches an extend operation. The
1844 operands represent the parameters from
1845
1846 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1847 bool
1848 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1849 rtx extract_imm)
1850 {
1851 HOST_WIDE_INT mult_val, extract_val;
1852
1853 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1854 return false;
1855
1856 mult_val = INTVAL (mult_imm);
1857 extract_val = INTVAL (extract_imm);
1858
1859 if (extract_val > 8
1860 && extract_val < GET_MODE_BITSIZE (mode)
1861 && exact_log2 (extract_val & ~7) > 0
1862 && (extract_val & 7) <= 4
1863 && mult_val == (1 << (extract_val & 7)))
1864 return true;
1865
1866 return false;
1867 }
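
/* Illustrative example (not part of the original sources): for DImode,
   MULT_IMM = 4 and EXTRACT_IMM = 34 satisfy the checks above
   (34 > 8, 34 < 64, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 and
   4 == 1 << 2); extracting the low 34 bits of (reg * 4) behaves like a
   32-bit extend of the register followed by a left shift of 2.  */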
1868
1869 /* Emit an insn that's a simple single-set. Both the operands must be
1870 known to be valid. */
1871 inline static rtx_insn *
1872 emit_set_insn (rtx x, rtx y)
1873 {
1874 return emit_insn (gen_rtx_SET (x, y));
1875 }
1876
1877 /* X and Y are two things to compare using CODE. Emit the compare insn and
1878 return the rtx for register 0 in the proper mode. */
1879 rtx
1880 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1881 {
1882 machine_mode mode = SELECT_CC_MODE (code, x, y);
1883 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1884
1885 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1886 return cc_reg;
1887 }
1888
1889 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1890
1891 static rtx
1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1893 machine_mode y_mode)
1894 {
1895 if (y_mode == E_QImode || y_mode == E_HImode)
1896 {
1897 if (CONST_INT_P (y))
1898 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1899 else
1900 {
1901 rtx t, cc_reg;
1902 machine_mode cc_mode;
1903
1904 t = gen_rtx_ZERO_EXTEND (SImode, y);
1905 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1906 cc_mode = CC_SWPmode;
1907 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1908 emit_set_insn (cc_reg, t);
1909 return cc_reg;
1910 }
1911 }
1912
1913 return aarch64_gen_compare_reg (code, x, y);
1914 }
1915
1916 /* Build the SYMBOL_REF for __tls_get_addr. */
1917
1918 static GTY(()) rtx tls_get_addr_libfunc;
1919
1920 rtx
1921 aarch64_tls_get_addr (void)
1922 {
1923 if (!tls_get_addr_libfunc)
1924 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1925 return tls_get_addr_libfunc;
1926 }
1927
1928 /* Return the TLS model to use for ADDR. */
1929
1930 static enum tls_model
1931 tls_symbolic_operand_type (rtx addr)
1932 {
1933 enum tls_model tls_kind = TLS_MODEL_NONE;
1934 if (GET_CODE (addr) == CONST)
1935 {
1936 poly_int64 addend;
1937 rtx sym = strip_offset (addr, &addend);
1938 if (GET_CODE (sym) == SYMBOL_REF)
1939 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1940 }
1941 else if (GET_CODE (addr) == SYMBOL_REF)
1942 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1943
1944 return tls_kind;
1945 }
1946
1947 /* We allow LO_SUMs in our legitimate addresses so that combine
1948 can take care of combining addresses where necessary, but for
1949 generation purposes, we generate the address
1950 as:
1951          RTL                               Absolute
1952      tmp = hi (symbol_ref);            adrp  x1, foo
1953      dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1954                                        nop
1955
1956          PIC                               TLS
1957      adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1958      ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1959                                        bl   __tls_get_addr
1960                                        nop
1961
1962 Load TLS symbol, depending on TLS mechanism and TLS access model.
1963
1964 Global Dynamic - Traditional TLS:
1965 adrp tmp, :tlsgd:imm
1966 add dest, tmp, #:tlsgd_lo12:imm
1967 bl __tls_get_addr
1968
1969 Global Dynamic - TLS Descriptors:
1970 adrp dest, :tlsdesc:imm
1971 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1972 add dest, dest, #:tlsdesc_lo12:imm
1973 blr tmp
1974 mrs tp, tpidr_el0
1975 add dest, dest, tp
1976
1977 Initial Exec:
1978 mrs tp, tpidr_el0
1979 adrp tmp, :gottprel:imm
1980 ldr dest, [tmp, #:gottprel_lo12:imm]
1981 add dest, dest, tp
1982
1983 Local Exec:
1984 mrs tp, tpidr_el0
1985 add t0, tp, #:tprel_hi12:imm, lsl #12
1986 add t0, t0, #:tprel_lo12_nc:imm
1987 */
1988
1989 static void
1990 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1991 enum aarch64_symbol_type type)
1992 {
1993 switch (type)
1994 {
1995 case SYMBOL_SMALL_ABSOLUTE:
1996 {
1997 /* In ILP32, the mode of dest can be either SImode or DImode. */
1998 rtx tmp_reg = dest;
1999 machine_mode mode = GET_MODE (dest);
2000
2001 gcc_assert (mode == Pmode || mode == ptr_mode);
2002
2003 if (can_create_pseudo_p ())
2004 tmp_reg = gen_reg_rtx (mode);
2005
2006 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2007 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2008 return;
2009 }
2010
2011 case SYMBOL_TINY_ABSOLUTE:
2012 emit_insn (gen_rtx_SET (dest, imm));
2013 return;
2014
2015 case SYMBOL_SMALL_GOT_28K:
2016 {
2017 machine_mode mode = GET_MODE (dest);
2018 rtx gp_rtx = pic_offset_table_rtx;
2019 rtx insn;
2020 rtx mem;
2021
2022 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2023 here before RTL expansion: tree IVOPTs will generate an RTL pattern to
2024 decide rtx costs, in which case pic_offset_table_rtx is not
2025 initialized. In that case there is no need to generate the first adrp
2026 instruction, as the final cost for global variable access is
2027 one instruction. */
2028 if (gp_rtx != NULL)
2029 {
2030 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2031 use the page base as the GOT base, the first page may be wasted; in
2032 the worst case there is only 28K of space for the GOT).
2033
2034 The generated instruction sequence for accessing a global variable
2035 is:
2036
2037 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2038
2039 Only one instruction is needed, but we must initialize
2040 pic_offset_table_rtx properly. We generate an initialization insn
2041 for every global access and rely on CSE to remove the redundant ones.
2042
2043 The final instruction sequence will look like the following
2044 for multiple global variable accesses.
2045
2046 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2047
2048 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2049 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2051 ... */
2052
2053 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2054 crtl->uses_pic_offset_table = 1;
2055 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2056
2057 if (mode != GET_MODE (gp_rtx))
2058 gp_rtx = gen_lowpart (mode, gp_rtx);
2059
2060 }
2061
2062 if (mode == ptr_mode)
2063 {
2064 if (mode == DImode)
2065 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2066 else
2067 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2068
2069 mem = XVECEXP (SET_SRC (insn), 0, 0);
2070 }
2071 else
2072 {
2073 gcc_assert (mode == Pmode);
2074
2075 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2076 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2077 }
2078
2079 /* The operand is expected to be a MEM. Whenever the related insn
2080 pattern changes, the code above that calculates MEM should be
2081 updated. */
2082 gcc_assert (GET_CODE (mem) == MEM);
2083 MEM_READONLY_P (mem) = 1;
2084 MEM_NOTRAP_P (mem) = 1;
2085 emit_insn (insn);
2086 return;
2087 }
2088
2089 case SYMBOL_SMALL_GOT_4G:
2090 {
2091 /* In ILP32, the mode of dest can be either SImode or DImode,
2092 while the got entry is always of SImode size. The mode of
2093 dest depends on how dest is used: if dest is assigned to a
2094 pointer (e.g. stored in memory), it has SImode; it may have
2095 DImode if dest is dereferenced to access the memory.
2096 This is why we have to handle three different ldr_got_small
2097 patterns here (two patterns for ILP32). */
2098
2099 rtx insn;
2100 rtx mem;
2101 rtx tmp_reg = dest;
2102 machine_mode mode = GET_MODE (dest);
2103
2104 if (can_create_pseudo_p ())
2105 tmp_reg = gen_reg_rtx (mode);
2106
2107 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2108 if (mode == ptr_mode)
2109 {
2110 if (mode == DImode)
2111 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2112 else
2113 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2114
2115 mem = XVECEXP (SET_SRC (insn), 0, 0);
2116 }
2117 else
2118 {
2119 gcc_assert (mode == Pmode);
2120
2121 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2122 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2123 }
2124
2125 gcc_assert (GET_CODE (mem) == MEM);
2126 MEM_READONLY_P (mem) = 1;
2127 MEM_NOTRAP_P (mem) = 1;
2128 emit_insn (insn);
2129 return;
2130 }
2131
2132 case SYMBOL_SMALL_TLSGD:
2133 {
2134 rtx_insn *insns;
2135 machine_mode mode = GET_MODE (dest);
2136 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2137
2138 start_sequence ();
2139 if (TARGET_ILP32)
2140 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2141 else
2142 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2143 insns = get_insns ();
2144 end_sequence ();
2145
2146 RTL_CONST_CALL_P (insns) = 1;
2147 emit_libcall_block (insns, dest, result, imm);
2148 return;
2149 }
2150
2151 case SYMBOL_SMALL_TLSDESC:
2152 {
2153 machine_mode mode = GET_MODE (dest);
2154 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2155 rtx tp;
2156
2157 gcc_assert (mode == Pmode || mode == ptr_mode);
2158
2159 /* In ILP32, the got entry is always of SImode size. Unlike
2160 small GOT, the dest is fixed at reg 0. */
2161 if (TARGET_ILP32)
2162 emit_insn (gen_tlsdesc_small_si (imm));
2163 else
2164 emit_insn (gen_tlsdesc_small_di (imm));
2165 tp = aarch64_load_tp (NULL);
2166
2167 if (mode != Pmode)
2168 tp = gen_lowpart (mode, tp);
2169
2170 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2171 if (REG_P (dest))
2172 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2173 return;
2174 }
2175
2176 case SYMBOL_SMALL_TLSIE:
2177 {
2178 /* In ILP32, the mode of dest can be either SImode or DImode,
2179 while the got entry is always of SImode size. The mode of
2180 dest depends on how dest is used: if dest is assigned to a
2181 pointer (e.g. stored in memory), it has SImode; it may have
2182 DImode if dest is dereferenced to access the memory.
2183 This is why we have to handle three different tlsie_small
2184 patterns here (two patterns for ILP32). */
2185 machine_mode mode = GET_MODE (dest);
2186 rtx tmp_reg = gen_reg_rtx (mode);
2187 rtx tp = aarch64_load_tp (NULL);
2188
2189 if (mode == ptr_mode)
2190 {
2191 if (mode == DImode)
2192 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2193 else
2194 {
2195 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2196 tp = gen_lowpart (mode, tp);
2197 }
2198 }
2199 else
2200 {
2201 gcc_assert (mode == Pmode);
2202 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2203 }
2204
2205 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2206 if (REG_P (dest))
2207 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2208 return;
2209 }
2210
2211 case SYMBOL_TLSLE12:
2212 case SYMBOL_TLSLE24:
2213 case SYMBOL_TLSLE32:
2214 case SYMBOL_TLSLE48:
2215 {
2216 machine_mode mode = GET_MODE (dest);
2217 rtx tp = aarch64_load_tp (NULL);
2218
2219 if (mode != Pmode)
2220 tp = gen_lowpart (mode, tp);
2221
2222 switch (type)
2223 {
2224 case SYMBOL_TLSLE12:
2225 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2226 (dest, tp, imm));
2227 break;
2228 case SYMBOL_TLSLE24:
2229 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2230 (dest, tp, imm));
2231 break;
2232 case SYMBOL_TLSLE32:
2233 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2234 (dest, imm));
2235 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2236 (dest, dest, tp));
2237 break;
2238 case SYMBOL_TLSLE48:
2239 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2240 (dest, imm));
2241 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2242 (dest, dest, tp));
2243 break;
2244 default:
2245 gcc_unreachable ();
2246 }
2247
2248 if (REG_P (dest))
2249 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2250 return;
2251 }
2252
2253 case SYMBOL_TINY_GOT:
2254 emit_insn (gen_ldr_got_tiny (dest, imm));
2255 return;
2256
2257 case SYMBOL_TINY_TLSIE:
2258 {
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tp = aarch64_load_tp (NULL);
2261
2262 if (mode == ptr_mode)
2263 {
2264 if (mode == DImode)
2265 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2266 else
2267 {
2268 tp = gen_lowpart (mode, tp);
2269 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2270 }
2271 }
2272 else
2273 {
2274 gcc_assert (mode == Pmode);
2275 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2276 }
2277
2278 if (REG_P (dest))
2279 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2280 return;
2281 }
2282
2283 default:
2284 gcc_unreachable ();
2285 }
2286 }
2287
2288 /* Emit a move from SRC to DEST. Assume that the move expanders can
2289 handle all moves if !can_create_pseudo_p (). The distinction is
2290 important because, unlike emit_move_insn, the move expanders know
2291 how to force Pmode objects into the constant pool even when the
2292 constant pool address is not itself legitimate. */
2293 static rtx
2294 aarch64_emit_move (rtx dest, rtx src)
2295 {
2296 return (can_create_pseudo_p ()
2297 ? emit_move_insn (dest, src)
2298 : emit_move_insn_1 (dest, src));
2299 }
2300
2301 /* Apply UNOPTAB to OP and store the result in DEST. */
2302
2303 static void
2304 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2305 {
2306 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2307 if (dest != tmp)
2308 emit_move_insn (dest, tmp);
2309 }
2310
2311 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2312
2313 static void
2314 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2315 {
2316 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2317 OPTAB_DIRECT);
2318 if (dest != tmp)
2319 emit_move_insn (dest, tmp);
2320 }
2321
2322 /* Split a 128-bit move operation into two 64-bit move operations,
2323 taking care to handle partial overlap of register to register
2324 copies. Special cases are needed when moving between GP regs and
2325 FP regs. SRC can be a register, constant or memory; DST a register
2326 or memory. If either operand is memory it must not have any side
2327 effects. */
2328 void
2329 aarch64_split_128bit_move (rtx dst, rtx src)
2330 {
2331 rtx dst_lo, dst_hi;
2332 rtx src_lo, src_hi;
2333
2334 machine_mode mode = GET_MODE (dst);
2335
2336 gcc_assert (mode == TImode || mode == TFmode);
2337 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2338 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2339
2340 if (REG_P (dst) && REG_P (src))
2341 {
2342 int src_regno = REGNO (src);
2343 int dst_regno = REGNO (dst);
2344
2345 /* Handle FP <-> GP regs. */
2346 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2347 {
2348 src_lo = gen_lowpart (word_mode, src);
2349 src_hi = gen_highpart (word_mode, src);
2350
2351 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2352 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2353 return;
2354 }
2355 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2356 {
2357 dst_lo = gen_lowpart (word_mode, dst);
2358 dst_hi = gen_highpart (word_mode, dst);
2359
2360 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2361 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2362 return;
2363 }
2364 }
2365
2366 dst_lo = gen_lowpart (word_mode, dst);
2367 dst_hi = gen_highpart (word_mode, dst);
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart_mode (word_mode, mode, src);
2370
2371 /* At most one pairing may overlap. */
2372 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2373 {
2374 aarch64_emit_move (dst_hi, src_hi);
2375 aarch64_emit_move (dst_lo, src_lo);
2376 }
2377 else
2378 {
2379 aarch64_emit_move (dst_lo, src_lo);
2380 aarch64_emit_move (dst_hi, src_hi);
2381 }
2382 }
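
/* Illustrative example (not part of the original sources): if the low
   half of DST is the same GP register as the high half of SRC (say a
   TImode copy from {x0,x1} to {x1,x2}), the code above moves the high
   halves first (x2 = x1) so that x1 is not overwritten before it is
   read; with no overlap the low halves are moved first.  */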
2383
2384 bool
2385 aarch64_split_128bit_move_p (rtx dst, rtx src)
2386 {
2387 return (! REG_P (src)
2388 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2389 }
2390
2391 /* Split a complex SIMD combine. */
2392
2393 void
2394 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2395 {
2396 machine_mode src_mode = GET_MODE (src1);
2397 machine_mode dst_mode = GET_MODE (dst);
2398
2399 gcc_assert (VECTOR_MODE_P (dst_mode));
2400 gcc_assert (register_operand (dst, dst_mode)
2401 && register_operand (src1, src_mode)
2402 && register_operand (src2, src_mode));
2403
2404 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2405 return;
2406 }
2407
2408 /* Split a complex SIMD move. */
2409
2410 void
2411 aarch64_split_simd_move (rtx dst, rtx src)
2412 {
2413 machine_mode src_mode = GET_MODE (src);
2414 machine_mode dst_mode = GET_MODE (dst);
2415
2416 gcc_assert (VECTOR_MODE_P (dst_mode));
2417
2418 if (REG_P (dst) && REG_P (src))
2419 {
2420 gcc_assert (VECTOR_MODE_P (src_mode));
2421 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2422 }
2423 }
2424
2425 bool
2426 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2427 machine_mode ymode, rtx y)
2428 {
2429 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2430 gcc_assert (r != NULL);
2431 return rtx_equal_p (x, r);
2432 }
2433
2434
2435 static rtx
2436 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2437 {
2438 if (can_create_pseudo_p ())
2439 return force_reg (mode, value);
2440 else
2441 {
2442 gcc_assert (x);
2443 aarch64_emit_move (x, value);
2444 return x;
2445 }
2446 }
2447
2448 /* Return true if we can move VALUE into a register using a single
2449 CNT[BHWD] instruction. */
2450
2451 static bool
2452 aarch64_sve_cnt_immediate_p (poly_int64 value)
2453 {
2454 HOST_WIDE_INT factor = value.coeffs[0];
2455 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2456 return (value.coeffs[1] == factor
2457 && IN_RANGE (factor, 2, 16 * 16)
2458 && (factor & 1) == 0
2459 && factor <= 16 * (factor & -factor));
2460 }
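
/* Illustrative examples (not part of the original sources) for the
   predicate above: poly_int64 (2, 2) and poly_int64 (32, 32) are
   accepted, poly_int64 (3, 3) is rejected (odd factor), and
   poly_int64 (34, 34) is rejected because 34 exceeds 16 times its
   lowest set bit (2).  */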
2461
2462 /* Likewise for rtx X. */
2463
2464 bool
2465 aarch64_sve_cnt_immediate_p (rtx x)
2466 {
2467 poly_int64 value;
2468 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2469 }
2470
2471 /* Return the asm string for an instruction with a CNT-like vector size
2472 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2473 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2474 first part of the operands template (the part that comes before the
2475 vector size itself). FACTOR is the number of quadwords.
2476 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2477 If it is zero, we can use any element size. */
2478
2479 static char *
2480 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2481 unsigned int factor,
2482 unsigned int nelts_per_vq)
2483 {
2484 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2485
2486 if (nelts_per_vq == 0)
2487 /* There is some overlap in the ranges of the four CNT instructions.
2488 Here we always use the smallest possible element size, so that the
2489 multiplier is 1 wherever possible. */
2490 nelts_per_vq = factor & -factor;
2491 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2492 gcc_assert (IN_RANGE (shift, 1, 4));
2493 char suffix = "dwhb"[shift - 1];
2494
2495 factor >>= shift;
2496 unsigned int written;
2497 if (factor == 1)
2498 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2499 prefix, suffix, operands);
2500 else
2501 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2502 prefix, suffix, operands, factor);
2503 gcc_assert (written < sizeof (buffer));
2504 return buffer;
2505 }
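
/* Illustrative examples (not part of the original sources) of the
   strings built by the function above, using "cnt" as PREFIX and
   "%x0" as OPERANDS:
     FACTOR 16, NELTS_PER_VQ 0  ->  "cntb\t%x0"
     FACTOR 48, NELTS_PER_VQ 0  ->  "cntb\t%x0, all, mul #3"
     FACTOR 8,  NELTS_PER_VQ 2  ->  "cntd\t%x0, all, mul #4".  */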
2506
2507 /* Return the asm string for an instruction with a CNT-like vector size
2508 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2509 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2510 first part of the operands template (the part that comes before the
2511 vector size itself). X is the value of the vector size operand,
2512 as a polynomial integer rtx. */
2513
2514 char *
2515 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2516 rtx x)
2517 {
2518 poly_int64 value = rtx_to_poly_int64 (x);
2519 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2520 return aarch64_output_sve_cnt_immediate (prefix, operands,
2521 value.coeffs[1], 0);
2522 }
2523
2524 /* Return true if we can add VALUE to a register using a single ADDVL
2525 or ADDPL instruction. */
2526
2527 static bool
2528 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2529 {
2530 HOST_WIDE_INT factor = value.coeffs[0];
2531 if (factor == 0 || value.coeffs[1] != factor)
2532 return false;
2533 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2534 and a value of 16 is one vector width. */
2535 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2536 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2537 }
2538
2539 /* Likewise for rtx X. */
2540
2541 bool
2542 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2543 {
2544 poly_int64 value;
2545 return (poly_int_rtx_p (x, &value)
2546 && aarch64_sve_addvl_addpl_immediate_p (value));
2547 }
2548
2549 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2550 and storing the result in operand 0. */
2551
2552 char *
2553 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2554 {
2555 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2556 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2557 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2558
2559 /* Use INC or DEC if possible. */
2560 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2561 {
2562 if (aarch64_sve_cnt_immediate_p (offset_value))
2563 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2564 offset_value.coeffs[1], 0);
2565 if (aarch64_sve_cnt_immediate_p (-offset_value))
2566 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2567 -offset_value.coeffs[1], 0);
2568 }
2569
2570 int factor = offset_value.coeffs[1];
2571 if ((factor & 15) == 0)
2572 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2573 else
2574 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2575 return buffer;
2576 }
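
/* Illustrative examples (not part of the original sources) for the
   function above, where poly_int64 (n, n) represents an offset of
   n bytes for every 128-bit block of SVE vector length (see the
   comment in aarch64_add_offset below):
     offset poly_int64 (16, 16)                       ->  "addvl\t%x0, %x1, #1"
     offset poly_int64 (6, 6)                         ->  "addpl\t%x0, %x1, #3"
     offset poly_int64 (2, 2), DEST == BASE, GP reg   ->  "incd\t%x0".  */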
2577
2578 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2579 instruction. If it is, store the number of elements in each vector
2580 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2581 factor in *FACTOR_OUT (if nonnull). */
2582
2583 bool
2584 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2585 unsigned int *nelts_per_vq_out)
2586 {
2587 rtx elt;
2588 poly_int64 value;
2589
2590 if (!const_vec_duplicate_p (x, &elt)
2591 || !poly_int_rtx_p (elt, &value))
2592 return false;
2593
2594 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2595 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2596 /* There's no vector INCB. */
2597 return false;
2598
2599 HOST_WIDE_INT factor = value.coeffs[0];
2600 if (value.coeffs[1] != factor)
2601 return false;
2602
2603 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2604 if ((factor % nelts_per_vq) != 0
2605 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2606 return false;
2607
2608 if (factor_out)
2609 *factor_out = factor;
2610 if (nelts_per_vq_out)
2611 *nelts_per_vq_out = nelts_per_vq;
2612 return true;
2613 }
2614
2615 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2616 instruction. */
2617
2618 bool
2619 aarch64_sve_inc_dec_immediate_p (rtx x)
2620 {
2621 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2622 }
2623
2624 /* Return the asm template for an SVE vector INC or DEC instruction.
2625 OPERANDS gives the operands before the vector count and X is the
2626 value of the vector count operand itself. */
2627
2628 char *
2629 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2630 {
2631 int factor;
2632 unsigned int nelts_per_vq;
2633 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2634 gcc_unreachable ();
2635 if (factor < 0)
2636 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2637 nelts_per_vq);
2638 else
2639 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2640 nelts_per_vq);
2641 }
2642
2643 static int
2644 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2645 scalar_int_mode mode)
2646 {
2647 int i;
2648 unsigned HOST_WIDE_INT val, val2, mask;
2649 int one_match, zero_match;
2650 int num_insns;
2651
2652 val = INTVAL (imm);
2653
2654 if (aarch64_move_imm (val, mode))
2655 {
2656 if (generate)
2657 emit_insn (gen_rtx_SET (dest, imm));
2658 return 1;
2659 }
2660
2661 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2662 (with XXXX non-zero). In that case check to see if the move can be done in
2663 a smaller mode. */
2664 val2 = val & 0xffffffff;
2665 if (mode == DImode
2666 && aarch64_move_imm (val2, SImode)
2667 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2671
2672 /* Check if we have to emit a second instruction by checking to see
2673 if any of the upper 32 bits of the original DI mode value is set. */
2674 if (val == val2)
2675 return 1;
2676
2677 i = (val >> 48) ? 48 : 32;
2678
2679 if (generate)
2680 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2681 GEN_INT ((val >> i) & 0xffff)));
2682
2683 return 2;
2684 }
2685
2686 if ((val >> 32) == 0 || mode == SImode)
2687 {
2688 if (generate)
2689 {
2690 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2691 if (mode == SImode)
2692 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2693 GEN_INT ((val >> 16) & 0xffff)));
2694 else
2695 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2696 GEN_INT ((val >> 16) & 0xffff)));
2697 }
2698 return 2;
2699 }
2700
2701 /* Remaining cases are all for DImode. */
2702
2703 mask = 0xffff;
2704 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2705 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2706 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2707 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2708
2709 if (zero_match != 2 && one_match != 2)
2710 {
2711 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2712 For a 64-bit bitmask try whether changing 16 bits to all ones or
2713 zeroes creates a valid bitmask. To check any repeated bitmask,
2714 try using 16 bits from the other 32-bit half of val. */
2715
2716 for (i = 0; i < 64; i += 16, mask <<= 16)
2717 {
2718 val2 = val & ~mask;
2719 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2720 break;
2721 val2 = val | mask;
2722 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2723 break;
2724 val2 = val2 & ~mask;
2725 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2726 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2727 break;
2728 }
2729 if (i != 64)
2730 {
2731 if (generate)
2732 {
2733 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2734 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2735 GEN_INT ((val >> i) & 0xffff)));
2736 }
2737 return 2;
2738 }
2739 }
2740
2741 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2742 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2743 otherwise skip zero bits. */
2744
2745 num_insns = 1;
2746 mask = 0xffff;
2747 val2 = one_match > zero_match ? ~val : val;
2748 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2749
2750 if (generate)
2751 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2752 ? (val | ~(mask << i))
2753 : (val & (mask << i)))));
2754 for (i += 16; i < 64; i += 16)
2755 {
2756 if ((val2 & (mask << i)) == 0)
2757 continue;
2758 if (generate)
2759 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2760 GEN_INT ((val >> i) & 0xffff)));
2761 num_insns ++;
2762 }
2763
2764 return num_insns;
2765 }
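
/* Illustrative example (not part of the original sources): for the
   DImode constant 0x0000cafe00001234 the function above returns 2 and,
   assuming DEST is register x0, emits roughly

	mov	x0, #0x1234
	movk	x0, #0xcafe, lsl #32

   since the low 32 bits form a valid move immediate and only one other
   16-bit chunk is nonzero.  */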
2766
2767 /* Return whether imm is a 128-bit immediate which is simple enough to
2768 expand inline. */
2769 bool
2770 aarch64_mov128_immediate (rtx imm)
2771 {
2772 if (GET_CODE (imm) == CONST_INT)
2773 return true;
2774
2775 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2776
2777 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2778 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2779
2780 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2781 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2782 }
2783
2784
2785 /* Return the number of temporary registers that aarch64_add_offset_1
2786 would need to add OFFSET to a register. */
2787
2788 static unsigned int
2789 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2790 {
2791 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2792 }
2793
2794 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2795 a non-polynomial OFFSET. MODE is the mode of the addition.
2796 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2797 be set and CFA adjustments added to the generated instructions.
2798
2799 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2800 temporary if register allocation is already complete. This temporary
2801 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2802 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2803 the immediate again.
2804
2805 Since this function may be used to adjust the stack pointer, we must
2806 ensure that it cannot cause transient stack deallocation (for example
2807 by first incrementing SP and then decrementing when adjusting by a
2808 large immediate). */
2809
2810 static void
2811 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2812 rtx src, HOST_WIDE_INT offset, rtx temp1,
2813 bool frame_related_p, bool emit_move_imm)
2814 {
2815 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2816 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2817
2818 HOST_WIDE_INT moffset = abs_hwi (offset);
2819 rtx_insn *insn;
2820
2821 if (!moffset)
2822 {
2823 if (!rtx_equal_p (dest, src))
2824 {
2825 insn = emit_insn (gen_rtx_SET (dest, src));
2826 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2827 }
2828 return;
2829 }
2830
2831 /* Single instruction adjustment. */
2832 if (aarch64_uimm12_shift (moffset))
2833 {
2834 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2835 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2836 return;
2837 }
2838
2839 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2840 and either:
2841
2842 a) the offset cannot be loaded by a 16-bit move or
2843 b) there is no spare register into which we can move it. */
2844 if (moffset < 0x1000000
2845 && ((!temp1 && !can_create_pseudo_p ())
2846 || !aarch64_move_imm (moffset, mode)))
2847 {
2848 HOST_WIDE_INT low_off = moffset & 0xfff;
2849
2850 low_off = offset < 0 ? -low_off : low_off;
2851 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2853 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2854 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2855 return;
2856 }
2857
2858 /* Emit a move immediate if required and an addition/subtraction. */
2859 if (emit_move_imm)
2860 {
2861 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2862 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2863 }
2864 insn = emit_insn (offset < 0
2865 ? gen_sub3_insn (dest, src, temp1)
2866 : gen_add3_insn (dest, src, temp1));
2867 if (frame_related_p)
2868 {
2869 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2870 rtx adj = plus_constant (mode, src, offset);
2871 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2872 }
2873 }
2874
2875 /* Return the number of temporary registers that aarch64_add_offset
2876 would need to move OFFSET into a register or add OFFSET to a register;
2877 ADD_P is true if we want the latter rather than the former. */
2878
2879 static unsigned int
2880 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2881 {
2882 /* This follows the same structure as aarch64_add_offset. */
2883 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2884 return 0;
2885
2886 unsigned int count = 0;
2887 HOST_WIDE_INT factor = offset.coeffs[1];
2888 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2889 poly_int64 poly_offset (factor, factor);
2890 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2891 /* Need one register for the ADDVL/ADDPL result. */
2892 count += 1;
2893 else if (factor != 0)
2894 {
2895 factor = abs (factor);
2896 if (factor > 16 * (factor & -factor))
2897 /* Need one register for the CNT result and one for the multiplication
2898 factor. If necessary, the second temporary can be reused for the
2899 constant part of the offset. */
2900 return 2;
2901 /* Need one register for the CNT result (which might then
2902 be shifted). */
2903 count += 1;
2904 }
2905 return count + aarch64_add_offset_1_temporaries (constant);
2906 }
2907
2908 /* If X can be represented as a poly_int64, return the number
2909 of temporaries that are required to add it to a register.
2910 Return -1 otherwise. */
2911
2912 int
2913 aarch64_add_offset_temporaries (rtx x)
2914 {
2915 poly_int64 offset;
2916 if (!poly_int_rtx_p (x, &offset))
2917 return -1;
2918 return aarch64_offset_temporaries (true, offset);
2919 }
2920
2921 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2922 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2923 be set and CFA adjustments added to the generated instructions.
2924
2925 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2926 temporary if register allocation is already complete. This temporary
2927 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2928 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2929 false to avoid emitting the immediate again.
2930
2931 TEMP2, if nonnull, is a second temporary register that doesn't
2932 overlap either DEST or SRC.
2933
2934 Since this function may be used to adjust the stack pointer, we must
2935 ensure that it cannot cause transient stack deallocation (for example
2936 by first incrementing SP and then decrementing when adjusting by a
2937 large immediate). */
2938
2939 static void
2940 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2941 poly_int64 offset, rtx temp1, rtx temp2,
2942 bool frame_related_p, bool emit_move_imm = true)
2943 {
2944 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2945 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2946 gcc_assert (temp1 == NULL_RTX
2947 || !frame_related_p
2948 || !reg_overlap_mentioned_p (temp1, dest));
2949 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2950
2951 /* Try using ADDVL or ADDPL to add the whole value. */
2952 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2953 {
2954 rtx offset_rtx = gen_int_mode (offset, mode);
2955 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2956 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2957 return;
2958 }
2959
2960 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2961 SVE vector register, over and above the minimum size of 128 bits.
2962 This is equivalent to half the value returned by CNTD with a
2963 vector shape of ALL. */
2964 HOST_WIDE_INT factor = offset.coeffs[1];
2965 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2966
2967 /* Try using ADDVL or ADDPL to add the VG-based part. */
2968 poly_int64 poly_offset (factor, factor);
2969 if (src != const0_rtx
2970 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2971 {
2972 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2973 if (frame_related_p)
2974 {
2975 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2976 RTX_FRAME_RELATED_P (insn) = true;
2977 src = dest;
2978 }
2979 else
2980 {
2981 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2982 src = aarch64_force_temporary (mode, temp1, addr);
2983 temp1 = temp2;
2984 temp2 = NULL_RTX;
2985 }
2986 }
2987 /* Otherwise use a CNT-based sequence. */
2988 else if (factor != 0)
2989 {
2990 /* Use a subtraction if we have a negative factor. */
2991 rtx_code code = PLUS;
2992 if (factor < 0)
2993 {
2994 factor = -factor;
2995 code = MINUS;
2996 }
2997
2998 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2999 into the multiplication. */
3000 rtx val;
3001 int shift = 0;
3002 if (factor & 1)
3003 /* Use a right shift by 1. */
3004 shift = -1;
3005 else
3006 factor /= 2;
3007 HOST_WIDE_INT low_bit = factor & -factor;
3008 if (factor <= 16 * low_bit)
3009 {
3010 if (factor > 16 * 8)
3011 {
3012 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3013 the value with the minimum multiplier and shift it into
3014 position. */
3015 int extra_shift = exact_log2 (low_bit);
3016 shift += extra_shift;
3017 factor >>= extra_shift;
3018 }
3019 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3020 }
3021 else
3022 {
3023 /* Use CNTD, then multiply it by FACTOR. */
3024 val = gen_int_mode (poly_int64 (2, 2), mode);
3025 val = aarch64_force_temporary (mode, temp1, val);
3026
3027 /* Go back to using a negative multiplication factor if we have
3028 no register from which to subtract. */
3029 if (code == MINUS && src == const0_rtx)
3030 {
3031 factor = -factor;
3032 code = PLUS;
3033 }
3034 rtx coeff1 = gen_int_mode (factor, mode);
3035 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3036 val = gen_rtx_MULT (mode, val, coeff1);
3037 }
3038
3039 if (shift > 0)
3040 {
3041 /* Multiply by 1 << SHIFT. */
3042 val = aarch64_force_temporary (mode, temp1, val);
3043 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3044 }
3045 else if (shift == -1)
3046 {
3047 /* Divide by 2. */
3048 val = aarch64_force_temporary (mode, temp1, val);
3049 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3050 }
3051
3052 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3053 if (src != const0_rtx)
3054 {
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_fmt_ee (code, mode, src, val);
3057 }
3058 else if (code == MINUS)
3059 {
3060 val = aarch64_force_temporary (mode, temp1, val);
3061 val = gen_rtx_NEG (mode, val);
3062 }
3063
3064 if (constant == 0 || frame_related_p)
3065 {
3066 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3067 if (frame_related_p)
3068 {
3069 RTX_FRAME_RELATED_P (insn) = true;
3070 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3071 gen_rtx_SET (dest, plus_constant (Pmode, src,
3072 poly_offset)));
3073 }
3074 src = dest;
3075 if (constant == 0)
3076 return;
3077 }
3078 else
3079 {
3080 src = aarch64_force_temporary (mode, temp1, val);
3081 temp1 = temp2;
3082 temp2 = NULL_RTX;
3083 }
3084
3085 emit_move_imm = true;
3086 }
3087
3088 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3089 frame_related_p, emit_move_imm);
3090 }
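
/* Illustrative example (not part of the original sources): adding the
   offset poly_int64 (20, 16), i.e. one full SVE vector plus 4 bytes,
   is decomposed by the function above into FACTOR = 16 and
   CONSTANT = 4; when adjusting the stack pointer this gives roughly

	addvl	sp, sp, #1
	add	sp, sp, #4.  */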
3091
3092 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3093 than a poly_int64. */
3094
3095 void
3096 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3097 rtx offset_rtx, rtx temp1, rtx temp2)
3098 {
3099 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3100 temp1, temp2, false);
3101 }
3102
3103 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3104 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3105 if TEMP1 already contains abs (DELTA). */
3106
3107 static inline void
3108 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3109 {
3110 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3111 temp1, temp2, true, emit_move_imm);
3112 }
3113
3114 /* Subtract DELTA from the stack pointer, marking the instructions
3115 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3116 if nonnull. */
3117
3118 static inline void
3119 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3120 bool emit_move_imm = true)
3121 {
3122 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3123 temp1, temp2, frame_related_p, emit_move_imm);
3124 }
3125
3126 /* Set DEST to (vec_series BASE STEP). */
3127
3128 static void
3129 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3130 {
3131 machine_mode mode = GET_MODE (dest);
3132 scalar_mode inner = GET_MODE_INNER (mode);
3133
3134 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3135 if (!aarch64_sve_index_immediate_p (base))
3136 base = force_reg (inner, base);
3137 if (!aarch64_sve_index_immediate_p (step))
3138 step = force_reg (inner, step);
3139
3140 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3141 }
3142
3143 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3144 integer of mode SRC_MODE. Return true on success. */
3145
3146 static bool
3147 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3148 rtx src)
3149 {
3150 /* If the constant is smaller than 128 bits, we can do the move
3151 using a vector of SRC_MODEs. */
3152 if (src_mode != TImode)
3153 {
3154 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3155 GET_MODE_SIZE (src_mode));
3156 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3157 emit_move_insn (gen_lowpart (dup_mode, dest),
3158 gen_const_vec_duplicate (dup_mode, src));
3159 return true;
3160 }
3161
3162 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3163 src = force_const_mem (src_mode, src);
3164 if (!src)
3165 return false;
3166
3167 /* Make sure that the address is legitimate. */
3168 if (!aarch64_sve_ld1r_operand_p (src))
3169 {
3170 rtx addr = force_reg (Pmode, XEXP (src, 0));
3171 src = replace_equiv_address (src, addr);
3172 }
3173
3174 machine_mode mode = GET_MODE (dest);
3175 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3176 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3177 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3178 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3179 emit_insn (gen_rtx_SET (dest, src));
3180 return true;
3181 }
3182
3183 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3184 isn't a simple duplicate or series. */
3185
3186 static void
3187 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3188 {
3189 machine_mode mode = GET_MODE (src);
3190 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3191 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3192 gcc_assert (npatterns > 1);
3193
3194 if (nelts_per_pattern == 1)
3195 {
3196 /* The constant is a repeating sequence of at least two elements,
3197 where the repeating elements occupy no more than 128 bits.
3198 Get an integer representation of the replicated value. */
3199 scalar_int_mode int_mode;
3200 if (BYTES_BIG_ENDIAN)
3201 /* For now, always use LD1RQ to load the value on big-endian
3202 targets, since the handling of smaller integers includes a
3203 subreg that is semantically an element reverse. */
3204 int_mode = TImode;
3205 else
3206 {
3207 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3208 gcc_assert (int_bits <= 128);
3209 int_mode = int_mode_for_size (int_bits, 0).require ();
3210 }
3211 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3212 if (int_value
3213 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3214 return;
3215 }
3216
3217 /* Expand each pattern individually. */
3218 rtx_vector_builder builder;
3219 auto_vec<rtx, 16> vectors (npatterns);
3220 for (unsigned int i = 0; i < npatterns; ++i)
3221 {
3222 builder.new_vector (mode, 1, nelts_per_pattern);
3223 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3224 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3225 vectors.quick_push (force_reg (mode, builder.build ()));
3226 }
3227
3228 /* Use permutes to interleave the separate vectors. */
3229 while (npatterns > 1)
3230 {
3231 npatterns /= 2;
3232 for (unsigned int i = 0; i < npatterns; ++i)
3233 {
3234 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3235 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3236 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3237 vectors[i] = tmp;
3238 }
3239 }
3240 gcc_assert (vectors[0] == dest);
3241 }
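
/* Illustrative note (not part of the original sources): in the
   interleaving loop above, a constant with four patterns is first
   loaded as four single-pattern vectors V0..V3 and then combined as
   ZIP1 (V0, V2) -> T0, ZIP1 (V1, V3) -> T1 and finally
   ZIP1 (T0, T1) -> DEST, which restores the original element order.  */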
3242
3243 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3244 is a pattern that can be used to set DEST to a replicated scalar
3245 element. */
3246
3247 void
3248 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3249 rtx (*gen_vec_duplicate) (rtx, rtx))
3250 {
3251 machine_mode mode = GET_MODE (dest);
3252
3253 /* Check on what type of symbol it is. */
3254 scalar_int_mode int_mode;
3255 if ((GET_CODE (imm) == SYMBOL_REF
3256 || GET_CODE (imm) == LABEL_REF
3257 || GET_CODE (imm) == CONST
3258 || GET_CODE (imm) == CONST_POLY_INT)
3259 && is_a <scalar_int_mode> (mode, &int_mode))
3260 {
3261 rtx mem;
3262 poly_int64 offset;
3263 HOST_WIDE_INT const_offset;
3264 enum aarch64_symbol_type sty;
3265
3266 /* If we have (const (plus symbol offset)), separate out the offset
3267 before we start classifying the symbol. */
3268 rtx base = strip_offset (imm, &offset);
3269
3270 /* We must always add an offset involving VL separately, rather than
3271 folding it into the relocation. */
3272 if (!offset.is_constant (&const_offset))
3273 {
3274 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3275 emit_insn (gen_rtx_SET (dest, imm));
3276 else
3277 {
3278 /* Do arithmetic on 32-bit values if the result is smaller
3279 than that. */
3280 if (partial_subreg_p (int_mode, SImode))
3281 {
3282 /* It is invalid to do symbol calculations in modes
3283 narrower than SImode. */
3284 gcc_assert (base == const0_rtx);
3285 dest = gen_lowpart (SImode, dest);
3286 int_mode = SImode;
3287 }
3288 if (base != const0_rtx)
3289 {
3290 base = aarch64_force_temporary (int_mode, dest, base);
3291 aarch64_add_offset (int_mode, dest, base, offset,
3292 NULL_RTX, NULL_RTX, false);
3293 }
3294 else
3295 aarch64_add_offset (int_mode, dest, base, offset,
3296 dest, NULL_RTX, false);
3297 }
3298 return;
3299 }
3300
3301 sty = aarch64_classify_symbol (base, const_offset);
3302 switch (sty)
3303 {
3304 case SYMBOL_FORCE_TO_MEM:
3305 if (const_offset != 0
3306 && targetm.cannot_force_const_mem (int_mode, imm))
3307 {
3308 gcc_assert (can_create_pseudo_p ());
3309 base = aarch64_force_temporary (int_mode, dest, base);
3310 aarch64_add_offset (int_mode, dest, base, const_offset,
3311 NULL_RTX, NULL_RTX, false);
3312 return;
3313 }
3314
3315 mem = force_const_mem (ptr_mode, imm);
3316 gcc_assert (mem);
3317
3318 /* If we aren't generating PC relative literals, then
3319 we need to expand the literal pool access carefully.
3320 This is something that needs to be done in a number
3321 of places, so could well live as a separate function. */
3322 if (!aarch64_pcrelative_literal_loads)
3323 {
3324 gcc_assert (can_create_pseudo_p ());
3325 base = gen_reg_rtx (ptr_mode);
3326 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3327 if (ptr_mode != Pmode)
3328 base = convert_memory_address (Pmode, base);
3329 mem = gen_rtx_MEM (ptr_mode, base);
3330 }
3331
3332 if (int_mode != ptr_mode)
3333 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3334
3335 emit_insn (gen_rtx_SET (dest, mem));
3336
3337 return;
3338
3339 case SYMBOL_SMALL_TLSGD:
3340 case SYMBOL_SMALL_TLSDESC:
3341 case SYMBOL_SMALL_TLSIE:
3342 case SYMBOL_SMALL_GOT_28K:
3343 case SYMBOL_SMALL_GOT_4G:
3344 case SYMBOL_TINY_GOT:
3345 case SYMBOL_TINY_TLSIE:
3346 if (const_offset != 0)
3347 {
3348 gcc_assert (can_create_pseudo_p ());
3349 base = aarch64_force_temporary (int_mode, dest, base);
3350 aarch64_add_offset (int_mode, dest, base, const_offset,
3351 NULL_RTX, NULL_RTX, false);
3352 return;
3353 }
3354 /* FALLTHRU */
3355
3356 case SYMBOL_SMALL_ABSOLUTE:
3357 case SYMBOL_TINY_ABSOLUTE:
3358 case SYMBOL_TLSLE12:
3359 case SYMBOL_TLSLE24:
3360 case SYMBOL_TLSLE32:
3361 case SYMBOL_TLSLE48:
3362 aarch64_load_symref_appropriately (dest, imm, sty);
3363 return;
3364
3365 default:
3366 gcc_unreachable ();
3367 }
3368 }
3369
3370 if (!CONST_INT_P (imm))
3371 {
3372 rtx base, step, value;
3373 if (GET_CODE (imm) == HIGH
3374 || aarch64_simd_valid_immediate (imm, NULL))
3375 emit_insn (gen_rtx_SET (dest, imm));
3376 else if (const_vec_series_p (imm, &base, &step))
3377 aarch64_expand_vec_series (dest, base, step);
3378 else if (const_vec_duplicate_p (imm, &value))
3379 {
3380 /* If the constant is out of range of an SVE vector move,
3381 load it from memory if we can, otherwise move it into
3382 a register and use a DUP. */
3383 scalar_mode inner_mode = GET_MODE_INNER (mode);
3384 rtx op = force_const_mem (inner_mode, value);
3385 if (!op)
3386 op = force_reg (inner_mode, value);
3387 else if (!aarch64_sve_ld1r_operand_p (op))
3388 {
3389 rtx addr = force_reg (Pmode, XEXP (op, 0));
3390 op = replace_equiv_address (op, addr);
3391 }
3392 emit_insn (gen_vec_duplicate (dest, op));
3393 }
3394 else if (GET_CODE (imm) == CONST_VECTOR
3395 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3396 aarch64_expand_sve_const_vector (dest, imm);
3397 else
3398 {
3399 rtx mem = force_const_mem (mode, imm);
3400 gcc_assert (mem);
3401 emit_move_insn (dest, mem);
3402 }
3403
3404 return;
3405 }
3406
3407 aarch64_internal_mov_immediate (dest, imm, true,
3408 as_a <scalar_int_mode> (mode));
3409 }
3410
3411 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3412 that is known to contain PTRUE. */
3413
3414 void
3415 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3416 {
3417 expand_operand ops[3];
3418 machine_mode mode = GET_MODE (dest);
3419 create_output_operand (&ops[0], dest, mode);
3420 create_input_operand (&ops[1], pred, GET_MODE (pred));
3421 create_input_operand (&ops[2], src, mode);
3422 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3423 }
3424
3425 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3426 operand is in memory. In this case we need to use the predicated LD1
3427 and ST1 instead of LDR and STR, both for correctness on big-endian
3428 targets and because LD1 and ST1 support a wider range of addressing modes.
3429 PRED_MODE is the mode of the predicate.
3430
3431 See the comment at the head of aarch64-sve.md for details about the
3432 big-endian handling. */
3433
3434 void
3435 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3436 {
3437 machine_mode mode = GET_MODE (dest);
3438 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3439 if (!register_operand (src, mode)
3440 && !register_operand (dest, mode))
3441 {
3442 rtx tmp = gen_reg_rtx (mode);
3443 if (MEM_P (src))
3444 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3445 else
3446 emit_move_insn (tmp, src);
3447 src = tmp;
3448 }
3449 aarch64_emit_sve_pred_move (dest, ptrue, src);
3450 }
3451
3452 /* Called only on big-endian targets. See whether an SVE vector move
3453 from SRC to DEST is effectively a REV[BHW] instruction, because at
3454 least one operand is a subreg of an SVE vector that has wider or
3455 narrower elements. Return true and emit the instruction if so.
3456
3457 For example:
3458
3459 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3460
3461 represents a VIEW_CONVERT between the following vectors, viewed
3462 in memory order:
3463
3464 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3465 R1: { [0], [1], [2], [3], ... }
3466
3467 The high part of lane X in R2 should therefore correspond to lane X*2
3468 of R1, but the register representations are:
3469
3470          msb                                   lsb
3471      R2: ...... [1].high  [1].low   [0].high  [0].low
3472      R1: ...... [3]       [2]       [1]       [0]
3473
3474 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3475 We therefore need a reverse operation to swap the high and low values
3476 around.
3477
3478 This is purely an optimization. Without it we would spill the
3479 subreg operand to the stack in one mode and reload it in the
3480 other mode, which has the same effect as the REV. */
3481
3482 bool
3483 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3484 {
3485 gcc_assert (BYTES_BIG_ENDIAN);
3486 if (GET_CODE (dest) == SUBREG)
3487 dest = SUBREG_REG (dest);
3488 if (GET_CODE (src) == SUBREG)
3489 src = SUBREG_REG (src);
3490
3491 /* The optimization handles two single SVE REGs with different element
3492 sizes. */
3493 if (!REG_P (dest)
3494 || !REG_P (src)
3495 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3496 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3497 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3498 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3499 return false;
3500
3501 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3502 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3503 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3504 UNSPEC_REV_SUBREG);
3505 emit_insn (gen_rtx_SET (dest, unspec));
3506 return true;
3507 }
3508
3509 /* Return a copy of X with mode MODE, without changing its other
3510 attributes. Unlike gen_lowpart, this doesn't care whether the
3511 mode change is valid. */
3512
3513 static rtx
3514 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3515 {
3516 if (GET_MODE (x) == mode)
3517 return x;
3518
3519 x = shallow_copy_rtx (x);
3520 set_mode_and_regno (x, mode, REGNO (x));
3521 return x;
3522 }
3523
3524 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3525 operands. */
3526
3527 void
3528 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3529 {
3530 /* Decide which REV operation we need. The mode with narrower elements
3531 determines the mode of the operands and the mode with the wider
3532 elements determines the reverse width. */
3533 machine_mode mode_with_wider_elts = GET_MODE (dest);
3534 machine_mode mode_with_narrower_elts = GET_MODE (src);
3535 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3536 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3537 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3538
3539 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3540 unsigned int unspec;
3541 if (wider_bytes == 8)
3542 unspec = UNSPEC_REV64;
3543 else if (wider_bytes == 4)
3544 unspec = UNSPEC_REV32;
3545 else if (wider_bytes == 2)
3546 unspec = UNSPEC_REV16;
3547 else
3548 gcc_unreachable ();
3549 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3550
3551 /* Emit:
3552
3553 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3554 UNSPEC_MERGE_PTRUE))
3555
3556 with the appropriate modes. */
3557 ptrue = gen_lowpart (pred_mode, ptrue);
3558 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3559 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3560 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3561 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3562 UNSPEC_MERGE_PTRUE);
3563 emit_insn (gen_rtx_SET (dest, src));
3564 }
3565
3566 static bool
3567 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3568 tree exp ATTRIBUTE_UNUSED)
3569 {
3570 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3571 return false;
3572
3573 return true;
3574 }
3575
3576 /* Implement TARGET_PASS_BY_REFERENCE. */
3577
3578 static bool
3579 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3580 machine_mode mode,
3581 const_tree type,
3582 bool named ATTRIBUTE_UNUSED)
3583 {
3584 HOST_WIDE_INT size;
3585 machine_mode dummymode;
3586 int nregs;
3587
3588 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3589 if (mode == BLKmode && type)
3590 size = int_size_in_bytes (type);
3591 else
3592 /* No frontends can create types with variable-sized modes, so we
3593 shouldn't be asked to pass or return them. */
3594 size = GET_MODE_SIZE (mode).to_constant ();
3595
3596 /* Aggregates are passed by reference based on their size. */
3597 if (type && AGGREGATE_TYPE_P (type))
3598 {
3599 size = int_size_in_bytes (type);
3600 }
3601
3602 /* Variable-sized arguments are always passed by reference. */
3603 if (size < 0)
3604 return true;
3605
3606 /* Can this be a candidate to be passed in fp/simd register(s)? */
3607 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3608 &dummymode, &nregs,
3609 NULL))
3610 return false;
3611
3612 /* Arguments which are variable sized or larger than 2 registers are
3613 passed by reference unless they are a homogeneous floating-point
3614 aggregate. */
3615 return size > 2 * UNITS_PER_WORD;
3616 }
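
/* Illustrative examples (hypothetical C types, not taken from the code):
   - struct { char c[32]; } is 32 bytes and not an HFA, so it is passed
     by reference.
   - struct { double d[4]; } is an HFA of four doubles, so it is a
     candidate for the SIMD/FP registers and is not passed by reference.
   - __int128 and struct { long a, b; } are 16 bytes, which fits in two
     general registers, so they are passed by value. */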
3617
3618 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3619 static bool
3620 aarch64_return_in_msb (const_tree valtype)
3621 {
3622 machine_mode dummy_mode;
3623 int dummy_int;
3624
3625 /* Never happens in little-endian mode. */
3626 if (!BYTES_BIG_ENDIAN)
3627 return false;
3628
3629 /* Only composite types smaller than or equal to 16 bytes can
3630 be potentially returned in registers. */
3631 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3632 || int_size_in_bytes (valtype) <= 0
3633 || int_size_in_bytes (valtype) > 16)
3634 return false;
3635
3636 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3637 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3638 is always passed/returned in the least significant bits of fp/simd
3639 register(s). */
3640 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3641 &dummy_mode, &dummy_int, NULL))
3642 return false;
3643
3644 return true;
3645 }
3646
3647 /* Implement TARGET_FUNCTION_VALUE.
3648 Define how to find the value returned by a function. */
3649
3650 static rtx
3651 aarch64_function_value (const_tree type, const_tree func,
3652 bool outgoing ATTRIBUTE_UNUSED)
3653 {
3654 machine_mode mode;
3655 int unsignedp;
3656 int count;
3657 machine_mode ag_mode;
3658
3659 mode = TYPE_MODE (type);
3660 if (INTEGRAL_TYPE_P (type))
3661 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3662
3663 if (aarch64_return_in_msb (type))
3664 {
3665 HOST_WIDE_INT size = int_size_in_bytes (type);
3666
3667 if (size % UNITS_PER_WORD != 0)
3668 {
3669 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3670 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3671 }
3672 }
3673
3674 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3675 &ag_mode, &count, NULL))
3676 {
3677 if (!aarch64_composite_type_p (type, mode))
3678 {
3679 gcc_assert (count == 1 && mode == ag_mode);
3680 return gen_rtx_REG (mode, V0_REGNUM);
3681 }
3682 else
3683 {
3684 int i;
3685 rtx par;
3686
3687 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3688 for (i = 0; i < count; i++)
3689 {
3690 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3691 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3692 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3693 XVECEXP (par, 0, i) = tmp;
3694 }
3695 return par;
3696 }
3697 }
3698 else
3699 return gen_rtx_REG (mode, R0_REGNUM);
3700 }
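
/* Illustrative example (hypothetical type): returning
     struct { float x, y; }
   gives an HFA with ag_mode == SFmode and count == 2, so the code above
   builds a PARALLEL of s0 and s1 with byte offsets 0 and 4. A plain
   integer return instead uses a single REG based at R0_REGNUM. */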
3701
3702 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3703 Return true if REGNO is the number of a hard register in which the values
3704 of a called function may come back. */
3705
3706 static bool
3707 aarch64_function_value_regno_p (const unsigned int regno)
3708 {
3709 /* A maximum of 16 bytes can be returned in the general registers. Examples
3710 of 16-byte return values are: 128-bit integers and 16-byte small
3711 structures (excluding homogeneous floating-point aggregates). */
3712 if (regno == R0_REGNUM || regno == R1_REGNUM)
3713 return true;
3714
3715 /* Up to four fp/simd registers can return a function value, e.g. a
3716 homogeneous floating-point aggregate having four members. */
3717 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3718 return TARGET_FLOAT;
3719
3720 return false;
3721 }
3722
3723 /* Implement TARGET_RETURN_IN_MEMORY.
3724
3725 If the type T of the result of a function is such that
3726 void func (T arg)
3727 would require that arg be passed as a value in a register (or set of
3728 registers) according to the parameter passing rules, then the result
3729 is returned in the same registers as would be used for such an
3730 argument. */
3731
3732 static bool
3733 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3734 {
3735 HOST_WIDE_INT size;
3736 machine_mode ag_mode;
3737 int count;
3738
3739 if (!AGGREGATE_TYPE_P (type)
3740 && TREE_CODE (type) != COMPLEX_TYPE
3741 && TREE_CODE (type) != VECTOR_TYPE)
3742 /* Simple scalar types are always returned in registers. */
3743 return false;
3744
3745 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3746 type,
3747 &ag_mode,
3748 &count,
3749 NULL))
3750 return false;
3751
3752 /* Types larger than 2 registers are returned in memory. */
3753 size = int_size_in_bytes (type);
3754 return (size < 0 || size > 2 * UNITS_PER_WORD);
3755 }
3756
3757 static bool
3758 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3759 const_tree type, int *nregs)
3760 {
3761 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3762 return aarch64_vfp_is_call_or_return_candidate (mode,
3763 type,
3764 &pcum->aapcs_vfp_rmode,
3765 nregs,
3766 NULL);
3767 }
3768
3769 /* Given MODE and TYPE of a function argument, return the alignment in
3770 bits. The idea is to suppress any stronger alignment requested by
3771 the user and opt for the natural alignment (specified in AAPCS64 \S
3772 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3773 calculated in versions of GCC prior to GCC-9. This is a helper
3774 function for local use only. */
3775
3776 static unsigned int
3777 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3778 bool *abi_break)
3779 {
3780 *abi_break = false;
3781 if (!type)
3782 return GET_MODE_ALIGNMENT (mode);
3783
3784 if (integer_zerop (TYPE_SIZE (type)))
3785 return 0;
3786
3787 gcc_assert (TYPE_MODE (type) == mode);
3788
3789 if (!AGGREGATE_TYPE_P (type))
3790 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3791
3792 if (TREE_CODE (type) == ARRAY_TYPE)
3793 return TYPE_ALIGN (TREE_TYPE (type));
3794
3795 unsigned int alignment = 0;
3796 unsigned int bitfield_alignment = 0;
3797 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3798 if (TREE_CODE (field) == FIELD_DECL)
3799 {
3800 alignment = std::max (alignment, DECL_ALIGN (field));
3801 if (DECL_BIT_FIELD_TYPE (field))
3802 bitfield_alignment
3803 = std::max (bitfield_alignment,
3804 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3805 }
3806
3807 if (bitfield_alignment > alignment)
3808 {
3809 *abi_break = true;
3810 return bitfield_alignment;
3811 }
3812
3813 return alignment;
3814 }
3815
3816 /* Layout a function argument according to the AAPCS64 rules. The rule
3817 numbers refer to the rule numbers in the AAPCS64. */
3818
3819 static void
3820 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3821 const_tree type,
3822 bool named ATTRIBUTE_UNUSED)
3823 {
3824 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3825 int ncrn, nvrn, nregs;
3826 bool allocate_ncrn, allocate_nvrn;
3827 HOST_WIDE_INT size;
3828 bool abi_break;
3829
3830 /* We need to do this once per argument. */
3831 if (pcum->aapcs_arg_processed)
3832 return;
3833
3834 pcum->aapcs_arg_processed = true;
3835
3836 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3837 if (type)
3838 size = int_size_in_bytes (type);
3839 else
3840 /* No frontends can create types with variable-sized modes, so we
3841 shouldn't be asked to pass or return them. */
3842 size = GET_MODE_SIZE (mode).to_constant ();
3843 size = ROUND_UP (size, UNITS_PER_WORD);
3844
3845 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3846 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3847 mode,
3848 type,
3849 &nregs);
3850
3851 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3852 The following code thus handles passing by SIMD/FP registers first. */
3853
3854 nvrn = pcum->aapcs_nvrn;
3855
3856 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3857 and homogeneous short-vector aggregates (HVA). */
3858 if (allocate_nvrn)
3859 {
3860 if (!TARGET_FLOAT)
3861 aarch64_err_no_fpadvsimd (mode);
3862
3863 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3864 {
3865 pcum->aapcs_nextnvrn = nvrn + nregs;
3866 if (!aarch64_composite_type_p (type, mode))
3867 {
3868 gcc_assert (nregs == 1);
3869 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3870 }
3871 else
3872 {
3873 rtx par;
3874 int i;
3875 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3876 for (i = 0; i < nregs; i++)
3877 {
3878 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3879 V0_REGNUM + nvrn + i);
3880 rtx offset = gen_int_mode
3881 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3882 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3883 XVECEXP (par, 0, i) = tmp;
3884 }
3885 pcum->aapcs_reg = par;
3886 }
3887 return;
3888 }
3889 else
3890 {
3891 /* C.3 NSRN is set to 8. */
3892 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3893 goto on_stack;
3894 }
3895 }
3896
3897 ncrn = pcum->aapcs_ncrn;
3898 nregs = size / UNITS_PER_WORD;
3899
3900 /* C6 - C9, though the sign and zero extension semantics are
3901 handled elsewhere. This is the case where the argument fits
3902 entirely in general registers. */
3903 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3904 {
3905 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3906
3907 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3908 rounded up to the next even number. */
3909 if (nregs == 2
3910 && ncrn % 2
3911 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3912 comparison is there because for > 16 * BITS_PER_UNIT
3913 alignment nregs should be > 2 and therefore it should be
3914 passed by reference rather than value. */
3915 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3916 == 16 * BITS_PER_UNIT))
3917 {
3918 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3919 inform (input_location, "parameter passing for argument of type "
3920 "%qT changed in GCC 9.1", type);
3921 ++ncrn;
3922 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3923 }
3924
3925 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3926 A reg is still generated for it, but the caller should be smart
3927 enough not to use it. */
3928 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3929 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3930 else
3931 {
3932 rtx par;
3933 int i;
3934
3935 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3936 for (i = 0; i < nregs; i++)
3937 {
3938 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3939 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3940 GEN_INT (i * UNITS_PER_WORD));
3941 XVECEXP (par, 0, i) = tmp;
3942 }
3943 pcum->aapcs_reg = par;
3944 }
3945
3946 pcum->aapcs_nextncrn = ncrn + nregs;
3947 return;
3948 }
3949
3950 /* C.11 */
3951 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3952
3953 /* The argument is passed on stack; record the needed number of words for
3954 this argument and align the total size if necessary. */
3955 on_stack:
3956 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3957
3958 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3959 == 16 * BITS_PER_UNIT)
3960 {
3961 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
3962 if (pcum->aapcs_stack_size != new_size)
3963 {
3964 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3965 inform (input_location, "parameter passing for argument of type "
3966 "%qT changed in GCC 9.1", type);
3967 pcum->aapcs_stack_size = new_size;
3968 }
3969 }
3970 return;
3971 }
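
/* Worked example (hypothetical prototype, for illustration only):
     void f (int a, double b, __int128 c);
   a is allocated to w0 and b to d0; c needs two general registers and
   has 16-byte alignment, so rule C.8 above rounds its NGRN up from 1 to
   2 and it is passed in the even/odd pair x2/x3 rather than x1/x2. */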
3972
3973 /* Implement TARGET_FUNCTION_ARG. */
3974
3975 static rtx
3976 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3977 const_tree type, bool named)
3978 {
3979 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3980 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3981
3982 if (mode == VOIDmode)
3983 return NULL_RTX;
3984
3985 aarch64_layout_arg (pcum_v, mode, type, named);
3986 return pcum->aapcs_reg;
3987 }
3988
3989 void
3990 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3991 const_tree fntype ATTRIBUTE_UNUSED,
3992 rtx libname ATTRIBUTE_UNUSED,
3993 const_tree fndecl ATTRIBUTE_UNUSED,
3994 unsigned n_named ATTRIBUTE_UNUSED)
3995 {
3996 pcum->aapcs_ncrn = 0;
3997 pcum->aapcs_nvrn = 0;
3998 pcum->aapcs_nextncrn = 0;
3999 pcum->aapcs_nextnvrn = 0;
4000 pcum->pcs_variant = ARM_PCS_AAPCS64;
4001 pcum->aapcs_reg = NULL_RTX;
4002 pcum->aapcs_arg_processed = false;
4003 pcum->aapcs_stack_words = 0;
4004 pcum->aapcs_stack_size = 0;
4005
4006 if (!TARGET_FLOAT
4007 && fndecl && TREE_PUBLIC (fndecl)
4008 && fntype && fntype != error_mark_node)
4009 {
4010 const_tree type = TREE_TYPE (fntype);
4011 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4012 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4013 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4014 &mode, &nregs, NULL))
4015 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4016 }
4017 return;
4018 }
4019
4020 static void
4021 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4022 machine_mode mode,
4023 const_tree type,
4024 bool named)
4025 {
4026 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4027 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4028 {
4029 aarch64_layout_arg (pcum_v, mode, type, named);
4030 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4031 != (pcum->aapcs_stack_words != 0));
4032 pcum->aapcs_arg_processed = false;
4033 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4034 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4035 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4036 pcum->aapcs_stack_words = 0;
4037 pcum->aapcs_reg = NULL_RTX;
4038 }
4039 }
4040
4041 bool
4042 aarch64_function_arg_regno_p (unsigned regno)
4043 {
4044 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4045 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4046 }
4047
4048 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4049 PARM_BOUNDARY bits of alignment, but will be given anything up
4050 to STACK_BOUNDARY bits if the type requires it. This makes sure
4051 that both before and after the layout of each argument, the Next
4052 Stacked Argument Address (NSAA) will have a minimum alignment of
4053 8 bytes. */
4054
4055 static unsigned int
4056 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4057 {
4058 bool abi_break;
4059 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4060 &abi_break);
4061 if (abi_break && warn_psabi)
4062 inform (input_location, "parameter passing for argument of type "
4063 "%qT changed in GCC 9.1", type);
4064
4065 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4066 }
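
/* For illustration: with PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128
   on this target, a char argument still gets a 64-bit boundary, while a
   type declared with 256-bit user alignment is clamped to a 128-bit
   boundary. */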
4067
4068 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4069
4070 static fixed_size_mode
4071 aarch64_get_reg_raw_mode (int regno)
4072 {
4073 if (TARGET_SVE && FP_REGNUM_P (regno))
4074 /* Don't use the SVE part of the register for __builtin_apply and
4075 __builtin_return. The SVE registers aren't used by the normal PCS,
4076 so using them there would be a waste of time. The PCS extensions
4077 for SVE types are fundamentally incompatible with the
4078 __builtin_return/__builtin_apply interface. */
4079 return as_a <fixed_size_mode> (V16QImode);
4080 return default_get_reg_raw_mode (regno);
4081 }
4082
4083 /* Implement TARGET_FUNCTION_ARG_PADDING.
4084
4085 Small aggregate types are placed in the lowest memory address.
4086
4087 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4088
4089 static pad_direction
4090 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4091 {
4092 /* On little-endian targets, the least significant byte of every stack
4093 argument is passed at the lowest byte address of the stack slot. */
4094 if (!BYTES_BIG_ENDIAN)
4095 return PAD_UPWARD;
4096
4097 /* Otherwise, integral, floating-point and pointer types are padded downward:
4098 the least significant byte of a stack argument is passed at the highest
4099 byte address of the stack slot. */
4100 if (type
4101 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4102 || POINTER_TYPE_P (type))
4103 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4104 return PAD_DOWNWARD;
4105
4106 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4107 return PAD_UPWARD;
4108 }
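
/* For example, on a big-endian target a char argument passed on the
   stack is padded downward, so its value sits in the highest-addressed
   byte of the slot, whereas a 3-byte struct is padded upward and starts
   at the lowest address of the slot. */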
4109
4110 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4111
4112 It specifies padding for the last (possibly the only)
4113 element of a block move between registers and memory. Assuming
4114 the block is in memory, padding upward means that the last
4115 element is padded after its highest significant byte, while in
4116 downward padding, the last element is padded at its least
4117 significant byte side.
4118
4119 Small aggregates and small complex types are always padded
4120 upwards.
4121
4122 We don't need to worry about homogeneous floating-point or
4123 short-vector aggregates; their move is not affected by the
4124 padding direction determined here. Regardless of endianness,
4125 each element of such an aggregate is put in the least
4126 significant bits of a fp/simd register.
4127
4128 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4129 register has useful data, and return the opposite if the most
4130 significant byte does. */
4131
4132 bool
4133 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4134 bool first ATTRIBUTE_UNUSED)
4135 {
4136
4137 /* Small composite types are always padded upward. */
4138 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4139 {
4140 HOST_WIDE_INT size;
4141 if (type)
4142 size = int_size_in_bytes (type);
4143 else
4144 /* No frontends can create types with variable-sized modes, so we
4145 shouldn't be asked to pass or return them. */
4146 size = GET_MODE_SIZE (mode).to_constant ();
4147 if (size < 2 * UNITS_PER_WORD)
4148 return true;
4149 }
4150
4151 /* Otherwise, use the default padding. */
4152 return !BYTES_BIG_ENDIAN;
4153 }
4154
4155 static scalar_int_mode
4156 aarch64_libgcc_cmp_return_mode (void)
4157 {
4158 return SImode;
4159 }
4160
4161 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4162
4163 /* We use the 12-bit shifted immediate arithmetic instructions so values
4164 must be a multiple of (1 << 12), i.e. 4096. */
4165 #define ARITH_FACTOR 4096
4166
4167 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4168 #error Cannot use simple address calculation for stack probing
4169 #endif
4170
4171 /* The pair of scratch registers used for stack probing. */
4172 #define PROBE_STACK_FIRST_REG R9_REGNUM
4173 #define PROBE_STACK_SECOND_REG R10_REGNUM
4174
4175 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4176 inclusive. These are offsets from the current stack pointer. */
4177
4178 static void
4179 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4180 {
4181 HOST_WIDE_INT size;
4182 if (!poly_size.is_constant (&size))
4183 {
4184 sorry ("stack probes for SVE frames");
4185 return;
4186 }
4187
4188 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4189
4190 /* See the same assertion on PROBE_INTERVAL above. */
4191 gcc_assert ((first % ARITH_FACTOR) == 0);
4192
4193 /* See if we have a constant small number of probes to generate. If so,
4194 that's the easy case. */
4195 if (size <= PROBE_INTERVAL)
4196 {
4197 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4198
4199 emit_set_insn (reg1,
4200 plus_constant (Pmode,
4201 stack_pointer_rtx, -(first + base)));
4202 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4203 }
4204
4205 /* The run-time loop is made up of 8 insns in the generic case while the
4206 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4207 else if (size <= 4 * PROBE_INTERVAL)
4208 {
4209 HOST_WIDE_INT i, rem;
4210
4211 emit_set_insn (reg1,
4212 plus_constant (Pmode,
4213 stack_pointer_rtx,
4214 -(first + PROBE_INTERVAL)));
4215 emit_stack_probe (reg1);
4216
4217 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4218 it exceeds SIZE. If only two probes are needed, this will not
4219 generate any code. Then probe at FIRST + SIZE. */
4220 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4221 {
4222 emit_set_insn (reg1,
4223 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4224 emit_stack_probe (reg1);
4225 }
4226
4227 rem = size - (i - PROBE_INTERVAL);
4228 if (rem > 256)
4229 {
4230 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4231
4232 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4233 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4234 }
4235 else
4236 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4237 }
4238
4239 /* Otherwise, do the same as above, but in a loop. Note that we must be
4240 extra careful with variables wrapping around because we might be at
4241 the very top (or the very bottom) of the address space and we have
4242 to be able to handle this case properly; in particular, we use an
4243 equality test for the loop condition. */
4244 else
4245 {
4246 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4247
4248 /* Step 1: round SIZE to the previous multiple of the interval. */
4249
4250 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4251
4252
4253 /* Step 2: compute initial and final value of the loop counter. */
4254
4255 /* TEST_ADDR = SP + FIRST. */
4256 emit_set_insn (reg1,
4257 plus_constant (Pmode, stack_pointer_rtx, -first));
4258
4259 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4260 HOST_WIDE_INT adjustment = - (first + rounded_size);
4261 if (! aarch64_uimm12_shift (adjustment))
4262 {
4263 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4264 true, Pmode);
4265 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4266 }
4267 else
4268 emit_set_insn (reg2,
4269 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4270
4271 /* Step 3: the loop
4272
4273 do
4274 {
4275 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4276 probe at TEST_ADDR
4277 }
4278 while (TEST_ADDR != LAST_ADDR)
4279
4280 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4281 until it is equal to ROUNDED_SIZE. */
4282
4283 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4284
4285
4286 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4287 that SIZE is equal to ROUNDED_SIZE. */
4288
4289 if (size != rounded_size)
4290 {
4291 HOST_WIDE_INT rem = size - rounded_size;
4292
4293 if (rem > 256)
4294 {
4295 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4296
4297 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4298 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4299 }
4300 else
4301 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4302 }
4303 }
4304
4305 /* Make sure nothing is scheduled before we are done. */
4306 emit_insn (gen_blockage ());
4307 }
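
/* Worked example (assuming the default 4096-byte PROBE_INTERVAL and
   FIRST == 0): for SIZE == 12288 the SIZE <= 4 * PROBE_INTERVAL case
   applies, and the code above probes at sp-4096, then at sp-8192 from
   the unrolled loop, and finally, because the 4096-byte remainder
   exceeds 256, at sp-12288 via one more subtract-and-probe. */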
4308
4309 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4310 absolute addresses. */
4311
4312 const char *
4313 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4314 {
4315 static int labelno = 0;
4316 char loop_lab[32];
4317 rtx xops[2];
4318
4319 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4320
4321 /* Loop. */
4322 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4323
4324 HOST_WIDE_INT stack_clash_probe_interval
4325 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4326
4327 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4328 xops[0] = reg1;
4329 HOST_WIDE_INT interval;
4330 if (flag_stack_clash_protection)
4331 interval = stack_clash_probe_interval;
4332 else
4333 interval = PROBE_INTERVAL;
4334
4335 gcc_assert (aarch64_uimm12_shift (interval));
4336 xops[1] = GEN_INT (interval);
4337
4338 output_asm_insn ("sub\t%0, %0, %1", xops);
4339
4340 /* If doing stack clash protection then we probe up by the ABI-specified
4341 amount. We do this because we're dropping full pages at a time in the
4342 loop. But if we're doing non-stack-clash probing, probe at offset 0. */
4343 if (flag_stack_clash_protection)
4344 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4345 else
4346 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4347
4348 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4349 by this amount for each iteration. */
4350 output_asm_insn ("str\txzr, [%0, %1]", xops);
4351
4352 /* Test if TEST_ADDR == LAST_ADDR. */
4353 xops[1] = reg2;
4354 output_asm_insn ("cmp\t%0, %1", xops);
4355
4356 /* Branch. */
4357 fputs ("\tb.ne\t", asm_out_file);
4358 assemble_name_raw (asm_out_file, loop_lab);
4359 fputc ('\n', asm_out_file);
4360
4361 return "";
4362 }
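
/* Roughly, in the non-stack-clash case and with the x9/x10 scratch
   registers chosen by aarch64_emit_probe_stack_range, the loop emitted
   above looks like:

       .LPSRL0:
               sub     x9, x9, 4096
               str     xzr, [x9, 0]
               cmp     x9, x10
               b.ne    .LPSRL0

   (illustrative only; the interval and the probe offset differ when
   -fstack-clash-protection is enabled). */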
4363
4364 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4365 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4366 of GUARD_SIZE. When a probe is emitted it is done at most
4367 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4368 at most MIN_PROBE_THRESHOLD. By the end of this function
4369 BASE = BASE - ADJUSTMENT. */
4370
4371 const char *
4372 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4373 rtx min_probe_threshold, rtx guard_size)
4374 {
4375 /* This function is not allowed to use any instruction generation function
4376 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4377 so instead emit the code you want using output_asm_insn. */
4378 gcc_assert (flag_stack_clash_protection);
4379 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4380 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4381
4382 /* The minimum required allocation before the residual requires probing. */
4383 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4384
4385 /* Clamp the value down to the nearest value that can be used with a cmp. */
4386 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4387 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4388
4389 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4390 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4391
4392 static int labelno = 0;
4393 char loop_start_lab[32];
4394 char loop_end_lab[32];
4395 rtx xops[2];
4396
4397 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4398 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4399
4400 /* Emit loop start label. */
4401 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4402
4403 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4404 xops[0] = adjustment;
4405 xops[1] = probe_offset_value_rtx;
4406 output_asm_insn ("cmp\t%0, %1", xops);
4407
4408 /* Branch to end if not enough adjustment to probe. */
4409 fputs ("\tb.lt\t", asm_out_file);
4410 assemble_name_raw (asm_out_file, loop_end_lab);
4411 fputc ('\n', asm_out_file);
4412
4413 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4414 xops[0] = base;
4415 xops[1] = probe_offset_value_rtx;
4416 output_asm_insn ("sub\t%0, %0, %1", xops);
4417
4418 /* Probe at BASE. */
4419 xops[1] = const0_rtx;
4420 output_asm_insn ("str\txzr, [%0, %1]", xops);
4421
4422 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4423 xops[0] = adjustment;
4424 xops[1] = probe_offset_value_rtx;
4425 output_asm_insn ("sub\t%0, %0, %1", xops);
4426
4427 /* Branch to start if still more bytes to allocate. */
4428 fputs ("\tb\t", asm_out_file);
4429 assemble_name_raw (asm_out_file, loop_start_lab);
4430 fputc ('\n', asm_out_file);
4431
4432 /* Not enough adjustment left to need a probe; exit the loop. */
4433 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4434
4435 /* BASE = BASE - ADJUSTMENT. */
4436 xops[0] = base;
4437 xops[1] = adjustment;
4438 output_asm_insn ("sub\t%0, %0, %1", xops);
4439 return "";
4440 }
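
/* Roughly, with BASE in x10, ADJUSTMENT in x11 and a clamped residual
   guard of 2048 bytes (all hypothetical values), the code above emits:

       .SVLPSPL0:
               cmp     x11, 2048
               b.lt    .SVLPEND0
               sub     x10, x10, 2048
               str     xzr, [x10, 0]
               sub     x11, x11, 2048
               b       .SVLPSPL0
       .SVLPEND0:
               sub     x10, x10, x11

   i.e. drop and probe in guard-sized steps until less than one step
   remains, then drop by the remainder without a probe. */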
4441
4442 /* Determine whether a frame chain needs to be generated. */
4443 static bool
4444 aarch64_needs_frame_chain (void)
4445 {
4446 /* Force a frame chain for EH returns so the return address is at FP+8. */
4447 if (frame_pointer_needed || crtl->calls_eh_return)
4448 return true;
4449
4450 /* A leaf function cannot have calls or write LR. */
4451 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4452
4453 /* Don't use a frame chain in leaf functions if leaf frame pointers
4454 are disabled. */
4455 if (flag_omit_leaf_frame_pointer && is_leaf)
4456 return false;
4457
4458 return aarch64_use_frame_pointer;
4459 }
4460
4461 /* Mark the registers that need to be saved by the callee and calculate
4462 the size of the callee-saved registers area and frame record (both FP
4463 and LR may be omitted). */
4464 static void
4465 aarch64_layout_frame (void)
4466 {
4467 HOST_WIDE_INT offset = 0;
4468 int regno, last_fp_reg = INVALID_REGNUM;
4469 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4470
4471 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4472
4473 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4474 the mid-end is doing. */
4475 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4476
4477 #define SLOT_NOT_REQUIRED (-2)
4478 #define SLOT_REQUIRED (-1)
4479
4480 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4481 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4482
4483 /* If this is a non-leaf simd function with calls we assume that
4484 at least one of those calls is to a non-simd function and thus
4485 we must save V8 to V23 in the prologue. */
4486
4487 if (simd_function && !crtl->is_leaf)
4488 {
4489 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4490 if (FP_SIMD_SAVED_REGNUM_P (regno))
4491 df_set_regs_ever_live (regno, true);
4492 }
4493
4494 /* First mark all the registers that really need to be saved... */
4495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4496 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4497
4498 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4499 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4500
4501 /* ... that includes the eh data registers (if needed)... */
4502 if (crtl->calls_eh_return)
4503 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4504 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4505 = SLOT_REQUIRED;
4506
4507 /* ... and any callee saved register that dataflow says is live. */
4508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4509 if (df_regs_ever_live_p (regno)
4510 && (regno == R30_REGNUM
4511 || !call_used_regs[regno]))
4512 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4513
4514 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4515 if (df_regs_ever_live_p (regno)
4516 && (!call_used_regs[regno]
4517 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4518 {
4519 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4520 last_fp_reg = regno;
4521 }
4522
4523 if (cfun->machine->frame.emit_frame_chain)
4524 {
4525 /* FP and LR are placed in the linkage record. */
4526 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4527 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4528 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4529 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4530 offset = 2 * UNITS_PER_WORD;
4531 }
4532
4533 /* With stack-clash, LR must be saved in non-leaf functions. */
4534 gcc_assert (crtl->is_leaf
4535 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4536 != SLOT_NOT_REQUIRED));
4537
4538 /* Now assign stack slots for them. */
4539 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4540 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4541 {
4542 cfun->machine->frame.reg_offset[regno] = offset;
4543 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4544 cfun->machine->frame.wb_candidate1 = regno;
4545 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4546 cfun->machine->frame.wb_candidate2 = regno;
4547 offset += UNITS_PER_WORD;
4548 }
4549
4550 HOST_WIDE_INT max_int_offset = offset;
4551 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4552 bool has_align_gap = offset != max_int_offset;
4553
4554 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4555 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4556 {
4557 /* If there is an alignment gap between integer and fp callee-saves,
4558 allocate the last fp register to it if possible. */
4559 if (regno == last_fp_reg
4560 && has_align_gap
4561 && !simd_function
4562 && (offset & 8) == 0)
4563 {
4564 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4565 break;
4566 }
4567
4568 cfun->machine->frame.reg_offset[regno] = offset;
4569 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4570 cfun->machine->frame.wb_candidate1 = regno;
4571 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4572 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4573 cfun->machine->frame.wb_candidate2 = regno;
4574 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4575 }
4576
4577 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4578
4579 cfun->machine->frame.saved_regs_size = offset;
4580
4581 HOST_WIDE_INT varargs_and_saved_regs_size
4582 = offset + cfun->machine->frame.saved_varargs_size;
4583
4584 cfun->machine->frame.hard_fp_offset
4585 = aligned_upper_bound (varargs_and_saved_regs_size
4586 + get_frame_size (),
4587 STACK_BOUNDARY / BITS_PER_UNIT);
4588
4589 /* Both these values are already aligned. */
4590 gcc_assert (multiple_p (crtl->outgoing_args_size,
4591 STACK_BOUNDARY / BITS_PER_UNIT));
4592 cfun->machine->frame.frame_size
4593 = (cfun->machine->frame.hard_fp_offset
4594 + crtl->outgoing_args_size);
4595
4596 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4597
4598 cfun->machine->frame.initial_adjust = 0;
4599 cfun->machine->frame.final_adjust = 0;
4600 cfun->machine->frame.callee_adjust = 0;
4601 cfun->machine->frame.callee_offset = 0;
4602
4603 HOST_WIDE_INT max_push_offset = 0;
4604 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4605 max_push_offset = 512;
4606 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4607 max_push_offset = 256;
4608
4609 HOST_WIDE_INT const_size, const_fp_offset;
4610 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4611 && const_size < max_push_offset
4612 && known_eq (crtl->outgoing_args_size, 0))
4613 {
4614 /* Simple, small frame with no outgoing arguments:
4615 stp reg1, reg2, [sp, -frame_size]!
4616 stp reg3, reg4, [sp, 16] */
4617 cfun->machine->frame.callee_adjust = const_size;
4618 }
4619 else if (known_lt (crtl->outgoing_args_size
4620 + cfun->machine->frame.saved_regs_size, 512)
4621 && !(cfun->calls_alloca
4622 && known_lt (cfun->machine->frame.hard_fp_offset,
4623 max_push_offset)))
4624 {
4625 /* Frame with small outgoing arguments:
4626 sub sp, sp, frame_size
4627 stp reg1, reg2, [sp, outgoing_args_size]
4628 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4629 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4630 cfun->machine->frame.callee_offset
4631 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4632 }
4633 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4634 && const_fp_offset < max_push_offset)
4635 {
4636 /* Frame with large outgoing arguments but a small local area:
4637 stp reg1, reg2, [sp, -hard_fp_offset]!
4638 stp reg3, reg4, [sp, 16]
4639 sub sp, sp, outgoing_args_size */
4640 cfun->machine->frame.callee_adjust = const_fp_offset;
4641 cfun->machine->frame.final_adjust
4642 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4643 }
4644 else
4645 {
4646 /* Frame with large local area and outgoing arguments using frame pointer:
4647 sub sp, sp, hard_fp_offset
4648 stp x29, x30, [sp, 0]
4649 add x29, sp, 0
4650 stp reg3, reg4, [sp, 16]
4651 sub sp, sp, outgoing_args_size */
4652 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4653 cfun->machine->frame.final_adjust
4654 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4655 }
4656
4657 cfun->machine->frame.laid_out = true;
4658 }
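
/* Worked example (hypothetical function): a non-SVE function that needs
   a frame chain, saves only x29/x30, has 32 bytes of locals and no
   outgoing arguments ends up with saved_regs_size == 16,
   hard_fp_offset == 48 and frame_size == 48, so the first case above
   applies and callee_adjust == 48, giving a prologue that starts with
       stp x29, x30, [sp, -48]! */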
4659
4660 /* Return true if the register REGNO is saved on entry to
4661 the current function. */
4662
4663 static bool
4664 aarch64_register_saved_on_entry (int regno)
4665 {
4666 return cfun->machine->frame.reg_offset[regno] >= 0;
4667 }
4668
4669 /* Return the next register from REGNO up to LIMIT that the callee
4670 needs to save. */
4671
4672 static unsigned
4673 aarch64_next_callee_save (unsigned regno, unsigned limit)
4674 {
4675 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4676 regno++;
4677 return regno;
4678 }
4679
4680 /* Push the register number REGNO of mode MODE to the stack with write-back
4681 adjusting the stack by ADJUSTMENT. */
4682
4683 static void
4684 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4685 HOST_WIDE_INT adjustment)
4686 {
4687 rtx base_rtx = stack_pointer_rtx;
4688 rtx insn, reg, mem;
4689
4690 reg = gen_rtx_REG (mode, regno);
4691 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4692 plus_constant (Pmode, base_rtx, -adjustment));
4693 mem = gen_frame_mem (mode, mem);
4694
4695 insn = emit_move_insn (mem, reg);
4696 RTX_FRAME_RELATED_P (insn) = 1;
4697 }
4698
4699 /* Generate and return an instruction to store the pair of registers
4700 REG and REG2 of mode MODE to location BASE with write-back adjusting
4701 the stack location BASE by ADJUSTMENT. */
4702
4703 static rtx
4704 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4705 HOST_WIDE_INT adjustment)
4706 {
4707 switch (mode)
4708 {
4709 case E_DImode:
4710 return gen_storewb_pairdi_di (base, base, reg, reg2,
4711 GEN_INT (-adjustment),
4712 GEN_INT (UNITS_PER_WORD - adjustment));
4713 case E_DFmode:
4714 return gen_storewb_pairdf_di (base, base, reg, reg2,
4715 GEN_INT (-adjustment),
4716 GEN_INT (UNITS_PER_WORD - adjustment));
4717 case E_TFmode:
4718 return gen_storewb_pairtf_di (base, base, reg, reg2,
4719 GEN_INT (-adjustment),
4720 GEN_INT (UNITS_PER_VREG - adjustment));
4721 default:
4722 gcc_unreachable ();
4723 }
4724 }
4725
4726 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4727 stack pointer by ADJUSTMENT. */
4728
4729 static void
4730 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4731 {
4732 rtx_insn *insn;
4733 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4734
4735 if (regno2 == INVALID_REGNUM)
4736 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4737
4738 rtx reg1 = gen_rtx_REG (mode, regno1);
4739 rtx reg2 = gen_rtx_REG (mode, regno2);
4740
4741 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4742 reg2, adjustment));
4743 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4744 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4745 RTX_FRAME_RELATED_P (insn) = 1;
4746 }
4747
4748 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4749 adjusting it by ADJUSTMENT afterwards. */
4750
4751 static rtx
4752 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4753 HOST_WIDE_INT adjustment)
4754 {
4755 switch (mode)
4756 {
4757 case E_DImode:
4758 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4759 GEN_INT (UNITS_PER_WORD));
4760 case E_DFmode:
4761 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4762 GEN_INT (UNITS_PER_WORD));
4763 case E_TFmode:
4764 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4765 GEN_INT (UNITS_PER_VREG));
4766 default:
4767 gcc_unreachable ();
4768 }
4769 }
4770
4771 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4772 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4773 into CFI_OPS. */
4774
4775 static void
4776 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4777 rtx *cfi_ops)
4778 {
4779 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4780 rtx reg1 = gen_rtx_REG (mode, regno1);
4781
4782 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4783
4784 if (regno2 == INVALID_REGNUM)
4785 {
4786 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4787 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4788 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4789 }
4790 else
4791 {
4792 rtx reg2 = gen_rtx_REG (mode, regno2);
4793 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4794 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4795 reg2, adjustment));
4796 }
4797 }
4798
4799 /* Generate and return a store pair instruction of mode MODE to store
4800 register REG1 to MEM1 and register REG2 to MEM2. */
4801
4802 static rtx
4803 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4804 rtx reg2)
4805 {
4806 switch (mode)
4807 {
4808 case E_DImode:
4809 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4810
4811 case E_DFmode:
4812 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4813
4814 case E_TFmode:
4815 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4816
4817 default:
4818 gcc_unreachable ();
4819 }
4820 }
4821
4822 /* Generate and return a load pair instruction of mode MODE to load register
4823 REG1 from MEM1 and register REG2 from MEM2. */
4824
4825 static rtx
4826 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4827 rtx mem2)
4828 {
4829 switch (mode)
4830 {
4831 case E_DImode:
4832 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4833
4834 case E_DFmode:
4835 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4836
4837 case E_TFmode:
4838 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4839
4840 default:
4841 gcc_unreachable ();
4842 }
4843 }
4844
4845 /* Return TRUE if return address signing should be enabled for the current
4846 function, otherwise return FALSE. */
4847
4848 bool
4849 aarch64_return_address_signing_enabled (void)
4850 {
4851 /* This function should only be called after the frame is laid out. */
4852 gcc_assert (cfun->machine->frame.laid_out);
4853
4854 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4855 if its LR is pushed onto the stack. */
4856 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4857 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4858 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4859 }
4860
4861 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4862 bool
4863 aarch64_bti_enabled (void)
4864 {
4865 return (aarch64_enable_bti == 1);
4866 }
4867
4868 /* Emit code to save the callee-saved registers from register number START
4869 to LIMIT to the stack at the location starting at offset START_OFFSET,
4870 skipping any write-back candidates if SKIP_WB is true. */
4871
4872 static void
4873 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4874 unsigned start, unsigned limit, bool skip_wb)
4875 {
4876 rtx_insn *insn;
4877 unsigned regno;
4878 unsigned regno2;
4879
4880 for (regno = aarch64_next_callee_save (start, limit);
4881 regno <= limit;
4882 regno = aarch64_next_callee_save (regno + 1, limit))
4883 {
4884 rtx reg, mem;
4885 poly_int64 offset;
4886 int offset_diff;
4887
4888 if (skip_wb
4889 && (regno == cfun->machine->frame.wb_candidate1
4890 || regno == cfun->machine->frame.wb_candidate2))
4891 continue;
4892
4893 if (cfun->machine->reg_is_wrapped_separately[regno])
4894 continue;
4895
4896 reg = gen_rtx_REG (mode, regno);
4897 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4898 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4899 offset));
4900
4901 regno2 = aarch64_next_callee_save (regno + 1, limit);
4902 offset_diff = cfun->machine->frame.reg_offset[regno2]
4903 - cfun->machine->frame.reg_offset[regno];
4904
4905 if (regno2 <= limit
4906 && !cfun->machine->reg_is_wrapped_separately[regno2]
4907 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4908 {
4909 rtx reg2 = gen_rtx_REG (mode, regno2);
4910 rtx mem2;
4911
4912 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4913 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4914 offset));
4915 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4916 reg2));
4917
4918 /* The first part of a frame-related parallel insn is
4919 always assumed to be relevant to the frame
4920 calculations; subsequent parts are only
4921 frame-related if explicitly marked. */
4922 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4923 regno = regno2;
4924 }
4925 else
4926 insn = emit_move_insn (mem, reg);
4927
4928 RTX_FRAME_RELATED_P (insn) = 1;
4929 }
4930 }
4931
4932 /* Emit code to restore the callee-saved registers of mode MODE from register
4933 number START up to and including LIMIT. Restore from the stack offset
4934 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4935 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4936
4937 static void
4938 aarch64_restore_callee_saves (machine_mode mode,
4939 poly_int64 start_offset, unsigned start,
4940 unsigned limit, bool skip_wb, rtx *cfi_ops)
4941 {
4942 rtx base_rtx = stack_pointer_rtx;
4943 unsigned regno;
4944 unsigned regno2;
4945 poly_int64 offset;
4946
4947 for (regno = aarch64_next_callee_save (start, limit);
4948 regno <= limit;
4949 regno = aarch64_next_callee_save (regno + 1, limit))
4950 {
4951 if (cfun->machine->reg_is_wrapped_separately[regno])
4952 continue;
4953
4954 rtx reg, mem;
4955 int offset_diff;
4956
4957 if (skip_wb
4958 && (regno == cfun->machine->frame.wb_candidate1
4959 || regno == cfun->machine->frame.wb_candidate2))
4960 continue;
4961
4962 reg = gen_rtx_REG (mode, regno);
4963 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4964 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4965
4966 regno2 = aarch64_next_callee_save (regno + 1, limit);
4967 offset_diff = cfun->machine->frame.reg_offset[regno2]
4968 - cfun->machine->frame.reg_offset[regno];
4969
4970 if (regno2 <= limit
4971 && !cfun->machine->reg_is_wrapped_separately[regno2]
4972 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4973 {
4974 rtx reg2 = gen_rtx_REG (mode, regno2);
4975 rtx mem2;
4976
4977 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4978 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4979 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4980
4981 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4982 regno = regno2;
4983 }
4984 else
4985 emit_move_insn (reg, mem);
4986 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4987 }
4988 }
4989
4990 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4991 of MODE. */
4992
4993 static inline bool
4994 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4995 {
4996 HOST_WIDE_INT multiple;
4997 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4998 && IN_RANGE (multiple, -8, 7));
4999 }
5000
5001 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5002 of MODE. */
5003
5004 static inline bool
5005 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5006 {
5007 HOST_WIDE_INT multiple;
5008 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5009 && IN_RANGE (multiple, 0, 63));
5010 }
5011
5012 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5013 of MODE. */
5014
5015 bool
5016 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5017 {
5018 HOST_WIDE_INT multiple;
5019 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5020 && IN_RANGE (multiple, -64, 63));
5021 }
5022
5023 /* Return true if OFFSET is a signed 9-bit value. */
5024
5025 bool
5026 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5027 poly_int64 offset)
5028 {
5029 HOST_WIDE_INT const_offset;
5030 return (offset.is_constant (&const_offset)
5031 && IN_RANGE (const_offset, -256, 255));
5032 }
5033
5034 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5035 of MODE. */
5036
5037 static inline bool
5038 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5039 {
5040 HOST_WIDE_INT multiple;
5041 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5042 && IN_RANGE (multiple, -256, 255));
5043 }
5044
5045 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5046 of MODE. */
5047
5048 static inline bool
5049 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5050 {
5051 HOST_WIDE_INT multiple;
5052 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5053 && IN_RANGE (multiple, 0, 4095));
5054 }
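
/* For example, with DImode (8-byte) accesses the predicates above accept
   byte offsets of -64..56 for the signed 4-bit form, 0..504 for the
   unsigned 6-bit form, -512..504 for the signed 7-bit form, -2048..2040
   for the signed 9-bit scaled form and 0..32760 for the unsigned 12-bit
   form, all in multiples of 8; the unscaled 9-bit form accepts any byte
   offset in [-256, 255]. */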
5055
5056 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5057
5058 static sbitmap
5059 aarch64_get_separate_components (void)
5060 {
5061 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5062 bitmap_clear (components);
5063
5064 /* The registers we need saved to the frame. */
5065 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5066 if (aarch64_register_saved_on_entry (regno))
5067 {
5068 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5069 if (!frame_pointer_needed)
5070 offset += cfun->machine->frame.frame_size
5071 - cfun->machine->frame.hard_fp_offset;
5072 /* Check that we can access the stack slot of the register with one
5073 direct load with no adjustments needed. */
5074 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5075 bitmap_set_bit (components, regno);
5076 }
5077
5078 /* Don't mess with the hard frame pointer. */
5079 if (frame_pointer_needed)
5080 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5081
5082 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5083 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5084 /* If registers have been chosen to be stored/restored with
5085 writeback don't interfere with them to avoid having to output explicit
5086 stack adjustment instructions. */
5087 if (reg2 != INVALID_REGNUM)
5088 bitmap_clear_bit (components, reg2);
5089 if (reg1 != INVALID_REGNUM)
5090 bitmap_clear_bit (components, reg1);
5091
5092 bitmap_clear_bit (components, LR_REGNUM);
5093 bitmap_clear_bit (components, SP_REGNUM);
5094
5095 return components;
5096 }
5097
5098 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5099
5100 static sbitmap
5101 aarch64_components_for_bb (basic_block bb)
5102 {
5103 bitmap in = DF_LIVE_IN (bb);
5104 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5105 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5106 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5107
5108 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5109 bitmap_clear (components);
5110
5111 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5112 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5113 if ((!call_used_regs[regno]
5114 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5115 && (bitmap_bit_p (in, regno)
5116 || bitmap_bit_p (gen, regno)
5117 || bitmap_bit_p (kill, regno)))
5118 {
5119 unsigned regno2, offset, offset2;
5120 bitmap_set_bit (components, regno);
5121
5122 /* If there is a callee-save at an adjacent offset, add it too,
5123 to increase the use of LDP/STP. */
5124 offset = cfun->machine->frame.reg_offset[regno];
5125 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5126
5127 if (regno2 <= LAST_SAVED_REGNUM)
5128 {
5129 offset2 = cfun->machine->frame.reg_offset[regno2];
5130 if ((offset & ~8) == (offset2 & ~8))
5131 bitmap_set_bit (components, regno2);
5132 }
5133 }
5134
5135 return components;
5136 }
5137
5138 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5139 Nothing to do for aarch64. */
5140
5141 static void
5142 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5143 {
5144 }
5145
5146 /* Return the next set bit in BMP from START onwards. Return the total number
5147 of bits in BMP if no set bit is found at or after START. */
5148
5149 static unsigned int
5150 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5151 {
5152 unsigned int nbits = SBITMAP_SIZE (bmp);
5153 if (start == nbits)
5154 return start;
5155
5156 gcc_assert (start < nbits);
5157 for (unsigned int i = start; i < nbits; i++)
5158 if (bitmap_bit_p (bmp, i))
5159 return i;
5160
5161 return nbits;
5162 }
5163
5164 /* Do the work for aarch64_emit_prologue_components and
5165 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5166 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5167 for these components or the epilogue sequence. That is, it determines
5168 whether we should emit stores or loads and what kind of CFA notes to attach
5169 to the insns. Otherwise the logic for the two sequences is very
5170 similar. */
5171
5172 static void
5173 aarch64_process_components (sbitmap components, bool prologue_p)
5174 {
5175 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5176 ? HARD_FRAME_POINTER_REGNUM
5177 : STACK_POINTER_REGNUM);
5178
5179 unsigned last_regno = SBITMAP_SIZE (components);
5180 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5181 rtx_insn *insn = NULL;
5182
5183 while (regno != last_regno)
5184 {
5185 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5186 so DFmode for the vector registers is enough. For simd functions
5187 we want to save the low 128 bits. */
5188 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5189
5190 rtx reg = gen_rtx_REG (mode, regno);
5191 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5192 if (!frame_pointer_needed)
5193 offset += cfun->machine->frame.frame_size
5194 - cfun->machine->frame.hard_fp_offset;
5195 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5196 rtx mem = gen_frame_mem (mode, addr);
5197
5198 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5199 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5200 /* No more registers to handle after REGNO.
5201 Emit a single save/restore and exit. */
5202 if (regno2 == last_regno)
5203 {
5204 insn = emit_insn (set);
5205 RTX_FRAME_RELATED_P (insn) = 1;
5206 if (prologue_p)
5207 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5208 else
5209 add_reg_note (insn, REG_CFA_RESTORE, reg);
5210 break;
5211 }
5212
5213 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5214 /* The next register is not of the same class or its offset is not
5215 mergeable with the current one into a pair. */
5216 if (!satisfies_constraint_Ump (mem)
5217 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5218 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5219 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5220 GET_MODE_SIZE (mode)))
5221 {
5222 insn = emit_insn (set);
5223 RTX_FRAME_RELATED_P (insn) = 1;
5224 if (prologue_p)
5225 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5226 else
5227 add_reg_note (insn, REG_CFA_RESTORE, reg);
5228
5229 regno = regno2;
5230 continue;
5231 }
5232
5233 /* REGNO2 can be saved/restored in a pair with REGNO. */
5234 rtx reg2 = gen_rtx_REG (mode, regno2);
5235 if (!frame_pointer_needed)
5236 offset2 += cfun->machine->frame.frame_size
5237 - cfun->machine->frame.hard_fp_offset;
5238 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5239 rtx mem2 = gen_frame_mem (mode, addr2);
5240 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5241 : gen_rtx_SET (reg2, mem2);
5242
5243 if (prologue_p)
5244 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5245 else
5246 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5247
5248 RTX_FRAME_RELATED_P (insn) = 1;
5249 if (prologue_p)
5250 {
5251 add_reg_note (insn, REG_CFA_OFFSET, set);
5252 add_reg_note (insn, REG_CFA_OFFSET, set2);
5253 }
5254 else
5255 {
5256 add_reg_note (insn, REG_CFA_RESTORE, reg);
5257 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5258 }
5259
5260 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5261 }
5262 }
5263
5264 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5265
5266 static void
5267 aarch64_emit_prologue_components (sbitmap components)
5268 {
5269 aarch64_process_components (components, true);
5270 }
5271
5272 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5273
5274 static void
5275 aarch64_emit_epilogue_components (sbitmap components)
5276 {
5277 aarch64_process_components (components, false);
5278 }
5279
5280 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5281
5282 static void
5283 aarch64_set_handled_components (sbitmap components)
5284 {
5285 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5286 if (bitmap_bit_p (components, regno))
5287 cfun->machine->reg_is_wrapped_separately[regno] = true;
5288 }
5289
5290 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5291 determine the probe offset for alloca. */
5292
5293 static HOST_WIDE_INT
5294 aarch64_stack_clash_protection_alloca_probe_range (void)
5295 {
5296 return STACK_CLASH_CALLER_GUARD;
5297 }
5298
5299
5300 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5301 registers. If POLY_SIZE is not large enough to require a probe, this function
5302 will only adjust the stack. When allocating the stack space,
5303 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5304 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5305 arguments. If we are, then we ensure that any allocation larger than the
5306 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer is
5307 maintained.
5308
5309 We emit barriers after each stack adjustment to prevent optimizations from
5310 breaking the invariant that we never drop the stack more than a page. This
5311 invariant is needed to make it easier to correctly handle asynchronous
5312 events, e.g. if we were to allow the stack to be dropped by more than a page
5313 with multiple probes still outstanding, and we take a signal somewhere in between,
5314 then the signal handler doesn't know the state of the stack and can make no
5315 assumptions about which pages have been probed. */
5316
5317 static void
5318 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5319 poly_int64 poly_size,
5320 bool frame_related_p,
5321 bool final_adjustment_p)
5322 {
5323 HOST_WIDE_INT guard_size
5324 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5325 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5326 /* When doing the final adjustment for the outgoing argument size we can't
5327 assume that LR was saved at position 0. So subtract its offset from the
5328 ABI safe buffer so that we don't accidentally allow an adjustment that
5329 would result in an allocation larger than the ABI buffer without
5330 probing. */
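/* For example, with the default 1KB caller guard and LR saved at, say,
   offset 16, the threshold computed below would be 1024 - 16 = 1008 bytes:
   any final adjustment of 1008 bytes or more must then be probed.  */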
5331 HOST_WIDE_INT min_probe_threshold
5332 = final_adjustment_p
5333 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5334 : guard_size - guard_used_by_caller;
5335
5336 poly_int64 frame_size = cfun->machine->frame.frame_size;
5337
5338 /* We should always have a positive probe threshold. */
5339 gcc_assert (min_probe_threshold > 0);
5340
5341 if (flag_stack_clash_protection && !final_adjustment_p)
5342 {
5343 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5344 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5345
5346 if (known_eq (frame_size, 0))
5347 {
5348 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5349 }
5350 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5351 && known_lt (final_adjust, guard_used_by_caller))
5352 {
5353 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5354 }
5355 }
5356
5357 /* If SIZE is not large enough to require probing, just adjust the stack and
5358 exit. */
5359 if (known_lt (poly_size, min_probe_threshold)
5360 || !flag_stack_clash_protection)
5361 {
5362 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5363 return;
5364 }
5365
5366 HOST_WIDE_INT size;
5367 /* Handle the SVE non-constant case first. */
5368 if (!poly_size.is_constant (&size))
5369 {
5370 if (dump_file)
5371 {
5372 fprintf (dump_file, "Stack clash SVE prologue: ");
5373 print_dec (poly_size, dump_file);
5374 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5375 }
5376
5377 /* First calculate the amount of bytes we're actually spilling. */
5378 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5379 poly_size, temp1, temp2, false, true);
5380
5381 rtx_insn *insn = get_last_insn ();
5382
5383 if (frame_related_p)
5384 {
5385 /* This is done to provide unwinding information for the stack
5386 adjustments we're about to do. However, to prevent the optimizers
5387 from removing the R11 move and leaving the CFA note (which would be
5388 very wrong) we tie the old and new stack pointer together.
5389 The tie will expand to nothing but the optimizers will not touch
5390 the instruction. */
5391 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5392 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5393 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5394
5395 /* We want the CFA independent of the stack pointer for the
5396 duration of the loop. */
5397 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5398 RTX_FRAME_RELATED_P (insn) = 1;
5399 }
5400
5401 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5402 rtx guard_const = gen_int_mode (guard_size, Pmode);
5403
5404 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5405 stack_pointer_rtx, temp1,
5406 probe_const, guard_const));
5407
5408 /* Now reset the CFA register if needed. */
5409 if (frame_related_p)
5410 {
5411 add_reg_note (insn, REG_CFA_DEF_CFA,
5412 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5413 gen_int_mode (poly_size, Pmode)));
5414 RTX_FRAME_RELATED_P (insn) = 1;
5415 }
5416
5417 return;
5418 }
5419
5420 if (dump_file)
5421 fprintf (dump_file,
5422 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5423 " bytes, probing will be required.\n", size);
5424
5425 /* Round size down to a multiple of guard_size, and calculate the
5426 residual as the difference between the original size and the rounded
5427 size. */
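/* For example, with a 64KB guard a frame of 150000 bytes gives
   rounded_size = 131072 (two fully probed pages) and residual = 18928.  */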
5428 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5429 HOST_WIDE_INT residual = size - rounded_size;
5430
5431 /* We can handle a small number of allocations/probes inline. Otherwise
5432 punt to a loop. */
5433 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5434 {
5435 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5436 {
5437 aarch64_sub_sp (NULL, temp2, guard_size, true);
5438 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5439 guard_used_by_caller));
5440 emit_insn (gen_blockage ());
5441 }
5442 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5443 }
5444 else
5445 {
5446 /* Compute the ending address. */
5447 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5448 temp1, NULL, false, true);
5449 rtx_insn *insn = get_last_insn ();
5450
5451 /* For the initial allocation, we don't have a frame pointer
5452 set up, so we always need CFI notes. If we're doing the
5453 final allocation, then we may have a frame pointer, in which
5454 case it is the CFA, otherwise we need CFI notes.
5455
5456 We can determine which allocation we are doing by looking at
5457 the value of FRAME_RELATED_P since the final allocations are not
5458 frame related. */
5459 if (frame_related_p)
5460 {
5461 /* We want the CFA independent of the stack pointer for the
5462 duration of the loop. */
5463 add_reg_note (insn, REG_CFA_DEF_CFA,
5464 plus_constant (Pmode, temp1, rounded_size));
5465 RTX_FRAME_RELATED_P (insn) = 1;
5466 }
5467
5468 /* This allocates and probes the stack. Note that this re-uses some of
5469 the existing Ada stack protection code. However, we are guaranteed not
5470 to enter the non-loop or residual branches of that code.
5471
5472 The non-loop part won't be entered because if our allocation amount
5473 doesn't require a loop, the case above would handle it.
5474
5475 The residual amount won't be entered because TEMP1 is a multiple of
5476 the allocation size. The residual will always be 0. As such, the only
5477 part we are actually using from that code is the loop setup. The
5478 actual probing is done in aarch64_output_probe_stack_range. */
5479 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5480 stack_pointer_rtx, temp1));
5481
5482 /* Now reset the CFA register if needed. */
5483 if (frame_related_p)
5484 {
5485 add_reg_note (insn, REG_CFA_DEF_CFA,
5486 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5487 RTX_FRAME_RELATED_P (insn) = 1;
5488 }
5489
5490 emit_insn (gen_blockage ());
5491 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5492 }
5493
5494 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5495 be probed. This maintains the requirement that each page is probed at
5496 least once. For initial probing we probe only if the allocation is
5497 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5498 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5499 GUARD_SIZE. This ensures that for any allocation large enough to
5500 trigger a probe here, we'll have at least one; and if an allocation is not
5501 large enough for this code to emit anything for it, the page will already
5502 have been probed by the saving of FP/LR, either by this function or by any
5503 callees. If we don't have any callees then we won't have more stack
5504 adjustments and so are still safe. */
5505 if (residual)
5506 {
5507 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5508 /* If we're doing final adjustments, and we've done any full page
5509 allocations then any residual needs to be probed. */
5510 if (final_adjustment_p && rounded_size != 0)
5511 min_probe_threshold = 0;
5512 /* If doing a small final adjustment, we always probe at offset 0.
5513 This is done to avoid issues when LR is not at position 0 or when
5514 the final adjustment is smaller than the probing offset. */
5515 else if (final_adjustment_p && rounded_size == 0)
5516 residual_probe_offset = 0;
5517
5518 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5519 if (residual >= min_probe_threshold)
5520 {
5521 if (dump_file)
5522 fprintf (dump_file,
5523 "Stack clash AArch64 prologue residuals: "
5524 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5525 "\n", residual);
5526
5527 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5528 residual_probe_offset));
5529 emit_insn (gen_blockage ());
5530 }
5531 }
5532 }
5533
5534 /* Return 1 if the register is used by the epilogue. We need to say the
5535 return register is used, but only after epilogue generation is complete.
5536 Note that in the case of sibcalls, the values "used by the epilogue" are
5537 considered live at the start of the called function.
5538
5539 For SIMD functions we need to return 1 for FP registers that are saved and
5540 restored by a function but are not zero in call_used_regs. If we do not do
5541 this, optimizations may remove the restore of the register. */
5542
5543 int
5544 aarch64_epilogue_uses (int regno)
5545 {
5546 if (epilogue_completed)
5547 {
5548 if (regno == LR_REGNUM)
5549 return 1;
5550 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5551 return 1;
5552 }
5553 return 0;
5554 }
5555
5556 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5557 is saved at BASE + OFFSET. */
5558
5559 static void
5560 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5561 rtx base, poly_int64 offset)
5562 {
5563 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5564 add_reg_note (insn, REG_CFA_EXPRESSION,
5565 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5566 }
5567
5568 /* AArch64 stack frames generated by this compiler look like:
5569
5570 +-------------------------------+
5571 | |
5572 | incoming stack arguments |
5573 | |
5574 +-------------------------------+
5575 | | <-- incoming stack pointer (aligned)
5576 | callee-allocated save area |
5577 | for register varargs |
5578 | |
5579 +-------------------------------+
5580 | local variables | <-- frame_pointer_rtx
5581 | |
5582 +-------------------------------+
5583 | padding | \
5584 +-------------------------------+ |
5585 | callee-saved registers | | frame.saved_regs_size
5586 +-------------------------------+ |
5587 | LR' | |
5588 +-------------------------------+ |
5589 | FP' | / <- hard_frame_pointer_rtx (aligned)
5590 +-------------------------------+
5591 | dynamic allocation |
5592 +-------------------------------+
5593 | padding |
5594 +-------------------------------+
5595 | outgoing stack arguments | <-- arg_pointer
5596 | |
5597 +-------------------------------+
5598 | | <-- stack_pointer_rtx (aligned)
5599
5600 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5601 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5602 unchanged.
5603
5604 By default for stack-clash we assume the guard is at least 64KB, but this
5605 value is configurable to either 4KB or 64KB. We also force the guard size to
5606 be the same as the probing interval and both values are kept in sync.
5607
5608 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5609 on the guard size) of stack space without probing.
5610
5611 When probing is needed, we emit a probe at the start of the prologue
5612 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5613
5614 We have to track how much space has been allocated, and the only stores
5615 to the stack we track as implicit probes are the FP/LR stores.
5616
5617 For outgoing arguments we probe if the size is larger than 1KB, such that
5618 the ABI specified buffer is maintained for the next callee.
5619
5620 The following registers are reserved during frame layout and should not be
5621 used for any other purpose:
5622
5623 - r11: Used by stack clash protection when SVE is enabled.
5624 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5625 - r14 and r15: Used for speculation tracking.
5626 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5627 - r30(LR), r29(FP): Used by standard frame layout.
5628
5629 These registers must be avoided in frame layout related code unless the
5630 explicit intention is to interact with one of the features listed above. */
5631
5632 /* Generate the prologue instructions for entry into a function.
5633 Establish the stack frame by decreasing the stack pointer with a
5634 properly calculated size and, if necessary, create a frame record
5635 filled with the values of LR and previous frame pointer. The
5636 current FP is also set up if it is in use. */
5637
5638 void
5639 aarch64_expand_prologue (void)
5640 {
5641 poly_int64 frame_size = cfun->machine->frame.frame_size;
5642 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5643 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5644 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5645 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5646 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5647 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5648 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5649 rtx_insn *insn;
5650
5651 /* Sign return address for functions. */
5652 if (aarch64_return_address_signing_enabled ())
5653 {
5654 insn = emit_insn (gen_pacisp ());
5655 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5656 RTX_FRAME_RELATED_P (insn) = 1;
5657 }
5658
5659 if (flag_stack_usage_info)
5660 current_function_static_stack_size = constant_lower_bound (frame_size);
5661
5662 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5663 {
5664 if (crtl->is_leaf && !cfun->calls_alloca)
5665 {
5666 if (maybe_gt (frame_size, PROBE_INTERVAL)
5667 && maybe_gt (frame_size, get_stack_check_protect ()))
5668 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5669 (frame_size
5670 - get_stack_check_protect ()));
5671 }
5672 else if (maybe_gt (frame_size, 0))
5673 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5674 }
5675
5676 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5677 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5678
5679 /* In theory we should never have both an initial adjustment
5680 and a callee save adjustment. Verify that is the case since the
5681 code below does not handle it for -fstack-clash-protection. */
5682 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5683
5684 /* Will only probe if the initial adjustment is larger than the guard
5685 less the amount of the guard reserved for use by the caller's
5686 outgoing args. */
5687 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5688 true, false);
5689
5690 if (callee_adjust != 0)
5691 aarch64_push_regs (reg1, reg2, callee_adjust);
5692
5693 if (emit_frame_chain)
5694 {
5695 poly_int64 reg_offset = callee_adjust;
5696 if (callee_adjust == 0)
5697 {
5698 reg1 = R29_REGNUM;
5699 reg2 = R30_REGNUM;
5700 reg_offset = callee_offset;
5701 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5702 }
5703 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5704 stack_pointer_rtx, callee_offset,
5705 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5706 if (frame_pointer_needed && !frame_size.is_constant ())
5707 {
5708 /* Variable-sized frames need to describe the save slot
5709 address using DW_CFA_expression rather than DW_CFA_offset.
5710 This means that, without taking further action, the
5711 locations of the registers that we've already saved would
5712 remain based on the stack pointer even after we redefine
5713 the CFA based on the frame pointer. We therefore need new
5714 DW_CFA_expressions to re-express the save slots with addresses
5715 based on the frame pointer. */
5716 rtx_insn *insn = get_last_insn ();
5717 gcc_assert (RTX_FRAME_RELATED_P (insn));
5718
5719 /* Add an explicit CFA definition if this was previously
5720 implicit. */
5721 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5722 {
5723 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5724 callee_offset);
5725 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5726 gen_rtx_SET (hard_frame_pointer_rtx, src));
5727 }
5728
5729 /* Change the save slot expressions for the registers that
5730 we've already saved. */
5731 reg_offset -= callee_offset;
5732 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5733 reg_offset + UNITS_PER_WORD);
5734 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5735 reg_offset);
5736 }
5737 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5738 }
5739
5740 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5741 callee_adjust != 0 || emit_frame_chain);
5742 if (aarch64_simd_decl_p (cfun->decl))
5743 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5744 callee_adjust != 0 || emit_frame_chain);
5745 else
5746 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5747 callee_adjust != 0 || emit_frame_chain);
5748
5749 /* We may need to probe the final adjustment if it is larger than the guard
5750 that is assumed by the callee. */
5751 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5752 !frame_pointer_needed, true);
5753 }
5754
5755 /* Return TRUE if we can use a simple_return insn.
5756
5757 This function checks whether the callee-saved stack is empty, which
5758 means no restore actions are needed. The pro_and_epilogue pass will use
5759 this to check whether the shrink-wrapping optimization is feasible. */
5760
5761 bool
5762 aarch64_use_return_insn_p (void)
5763 {
5764 if (!reload_completed)
5765 return false;
5766
5767 if (crtl->profile)
5768 return false;
5769
5770 return known_eq (cfun->machine->frame.frame_size, 0);
5771 }
5772
5773 /* Return false for non-leaf SIMD functions in order to avoid
5774 shrink-wrapping them. Doing this will lose the necessary
5775 save/restore of FP registers. */
5776
5777 bool
5778 aarch64_use_simple_return_insn_p (void)
5779 {
5780 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5781 return false;
5782
5783 return true;
5784 }
5785
5786 /* Generate the epilogue instructions for returning from a function.
5787 This is almost exactly the reverse of the prologue sequence, except
5788 that we need to insert barriers to avoid scheduling loads that read
5789 from a deallocated stack, and we optimize the unwind records by
5790 emitting them all together if possible. */
5791 void
5792 aarch64_expand_epilogue (bool for_sibcall)
5793 {
5794 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5795 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5796 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5797 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5798 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5799 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5800 rtx cfi_ops = NULL;
5801 rtx_insn *insn;
5802 /* A stack clash protection prologue may not have left EP0_REGNUM or
5803 EP1_REGNUM in a usable state. The same is true for allocations
5804 with an SVE component, since we then need both temporary registers
5805 for each allocation. For stack clash we are in a usable state if
5806 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5807 HOST_WIDE_INT guard_size
5808 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5809 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5810
5811 /* We can re-use the registers when the allocation amount is smaller than
5812 guard_size - guard_used_by_caller because we won't be doing any probes
5813 then. In such situations the register should remain live with the correct
5814 value. */
5815 bool can_inherit_p = (initial_adjust.is_constant ()
5816 && final_adjust.is_constant ())
5817 && (!flag_stack_clash_protection
5818 || known_lt (initial_adjust,
5819 guard_size - guard_used_by_caller));
5820
5821 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5822 bool need_barrier_p
5823 = maybe_ne (get_frame_size ()
5824 + cfun->machine->frame.saved_varargs_size, 0);
5825
5826 /* Emit a barrier to prevent loads from a deallocated stack. */
5827 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5828 || cfun->calls_alloca
5829 || crtl->calls_eh_return)
5830 {
5831 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5832 need_barrier_p = false;
5833 }
5834
5835 /* Restore the stack pointer from the frame pointer if it may not
5836 be the same as the stack pointer. */
5837 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5838 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5839 if (frame_pointer_needed
5840 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5841 /* If writeback is used when restoring callee-saves, the CFA
5842 is restored on the instruction doing the writeback. */
5843 aarch64_add_offset (Pmode, stack_pointer_rtx,
5844 hard_frame_pointer_rtx, -callee_offset,
5845 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5846 else
5847 /* The case where we need to re-use the register here is very rare, so
5848 avoid the complicated condition and just always emit a move if the
5849 immediate doesn't fit. */
5850 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5851
5852 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5853 callee_adjust != 0, &cfi_ops);
5854 if (aarch64_simd_decl_p (cfun->decl))
5855 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5856 callee_adjust != 0, &cfi_ops);
5857 else
5858 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5859 callee_adjust != 0, &cfi_ops);
5860
5861 if (need_barrier_p)
5862 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5863
5864 if (callee_adjust != 0)
5865 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5866
5867 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5868 {
5869 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5870 insn = get_last_insn ();
5871 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5872 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5873 RTX_FRAME_RELATED_P (insn) = 1;
5874 cfi_ops = NULL;
5875 }
5876
5877 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5878 restrict the emit_move optimization to leaf functions. */
5879 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5880 (!can_inherit_p || !crtl->is_leaf
5881 || df_regs_ever_live_p (EP0_REGNUM)));
5882
5883 if (cfi_ops)
5884 {
5885 /* Emit delayed restores and reset the CFA to be SP. */
5886 insn = get_last_insn ();
5887 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5888 REG_NOTES (insn) = cfi_ops;
5889 RTX_FRAME_RELATED_P (insn) = 1;
5890 }
5891
5892 /* We prefer to emit the combined return/authenticate instruction RETAA,
5893 however there are three cases in which we must instead emit an explicit
5894 authentication instruction.
5895
5896 1) Sibcalls don't return in a normal way, so if we're about to call one
5897 we must authenticate.
5898
5899 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5900 generating code for !TARGET_ARMV8_3 we can't use it and must
5901 explicitly authenticate.
5902
5903 3) On an eh_return path we make extra stack adjustments to update the
5904 canonical frame address to be the exception handler's CFA. We want
5905 to authenticate using the CFA of the function which calls eh_return.
5906 */
5907 if (aarch64_return_address_signing_enabled ()
5908 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5909 {
5910 insn = emit_insn (gen_autisp ());
5911 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5912 RTX_FRAME_RELATED_P (insn) = 1;
5913 }
5914
5915 /* Stack adjustment for exception handler. */
5916 if (crtl->calls_eh_return)
5917 {
5918 /* We need to unwind the stack by the offset computed by
5919 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5920 to be SP; letting the CFA move during this adjustment
5921 is just as correct as retaining the CFA from the body
5922 of the function. Therefore, do nothing special. */
5923 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5924 }
5925
5926 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5927 if (!for_sibcall)
5928 emit_jump_insn (ret_rtx);
5929 }
5930
5931 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5932 normally or return to a previous frame after unwinding.
5933
5934 An EH return uses a single shared return sequence. The epilogue is
5935 exactly like a normal epilogue except that it has an extra input
5936 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5937 that must be applied after the frame has been destroyed. An extra label
5938 is inserted before the epilogue which initializes this register to zero,
5939 and this is the entry point for a normal return.
5940
5941 An actual EH return updates the return address, initializes the stack
5942 adjustment and jumps directly into the epilogue (bypassing the zeroing
5943 of the adjustment). Since the return address is typically saved on the
5944 stack when a function makes a call, the saved LR must be updated outside
5945 the epilogue.
5946
5947 This poses problems as the store is generated well before the epilogue,
5948 so the offset of LR is not known yet. Also optimizations will remove the
5949 store as it appears dead, even after the epilogue is generated (as the
5950 base or offset for loading LR is different in many cases).
5951
5952 To avoid these problems this implementation forces the frame pointer
5953 in eh_return functions so that the location of LR is fixed and known early.
5954 It also marks the store volatile, so no optimization is permitted to
5955 remove the store. */
5956 rtx
5957 aarch64_eh_return_handler_rtx (void)
5958 {
5959 rtx tmp = gen_frame_mem (Pmode,
5960 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5961
5962 /* Mark the store volatile, so no optimization is permitted to remove it. */
5963 MEM_VOLATILE_P (tmp) = true;
5964 return tmp;
5965 }
5966
5967 /* Output code to add DELTA to the first argument, and then jump
5968 to FUNCTION. Used for C++ multiple inheritance. */
5969 static void
5970 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5971 HOST_WIDE_INT delta,
5972 HOST_WIDE_INT vcall_offset,
5973 tree function)
5974 {
5975 /* The this pointer is always in x0. Note that this differs from
5976 Arm where the this pointer may be bumped to r1 if r0 is required
5977 to return a pointer to an aggregate. On AArch64 a result value
5978 pointer will be in x8. */
5979 int this_regno = R0_REGNUM;
5980 rtx this_rtx, temp0, temp1, addr, funexp;
5981 rtx_insn *insn;
5982
5983 reload_completed = 1;
5984 emit_note (NOTE_INSN_PROLOGUE_END);
5985
5986 this_rtx = gen_rtx_REG (Pmode, this_regno);
5987 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5988 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5989
5990 if (vcall_offset == 0)
5991 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5992 else
5993 {
5994 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5995
5996 addr = this_rtx;
5997 if (delta != 0)
5998 {
5999 if (delta >= -256 && delta < 256)
6000 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6001 plus_constant (Pmode, this_rtx, delta));
6002 else
6003 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6004 temp1, temp0, false);
6005 }
6006
6007 if (Pmode == ptr_mode)
6008 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6009 else
6010 aarch64_emit_move (temp0,
6011 gen_rtx_ZERO_EXTEND (Pmode,
6012 gen_rtx_MEM (ptr_mode, addr)));
6013
6014 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6015 addr = plus_constant (Pmode, temp0, vcall_offset);
6016 else
6017 {
6018 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6019 Pmode);
6020 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6021 }
6022
6023 if (Pmode == ptr_mode)
6024 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6025 else
6026 aarch64_emit_move (temp1,
6027 gen_rtx_SIGN_EXTEND (Pmode,
6028 gen_rtx_MEM (ptr_mode, addr)));
6029
6030 emit_insn (gen_add2_insn (this_rtx, temp1));
6031 }
6032
6033 /* Generate a tail call to the target function. */
6034 if (!TREE_USED (function))
6035 {
6036 assemble_external (function);
6037 TREE_USED (function) = 1;
6038 }
6039 funexp = XEXP (DECL_RTL (function), 0);
6040 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6041 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6042 SIBLING_CALL_P (insn) = 1;
6043
6044 insn = get_insns ();
6045 shorten_branches (insn);
6046 final_start_function (insn, file, 1);
6047 final (insn, file, 1);
6048 final_end_function ();
6049
6050 /* Stop pretending to be a post-reload pass. */
6051 reload_completed = 0;
6052 }
6053
6054 static bool
6055 aarch64_tls_referenced_p (rtx x)
6056 {
6057 if (!TARGET_HAVE_TLS)
6058 return false;
6059 subrtx_iterator::array_type array;
6060 FOR_EACH_SUBRTX (iter, array, x, ALL)
6061 {
6062 const_rtx x = *iter;
6063 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6064 return true;
6065 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6066 TLS offsets, not real symbol references. */
6067 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6068 iter.skip_subrtxes ();
6069 }
6070 return false;
6071 }
6072
6073
6074 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6075 a left shift of 0 or 12 bits. */
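/* For example, 0xabc (shift of 0) and 0xabc000 (shift of 12) are accepted,
   whereas 0xabc00 is not, since it would need a shift of 8.  */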
6076 bool
6077 aarch64_uimm12_shift (HOST_WIDE_INT val)
6078 {
6079 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6080 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6081 );
6082 }
6083
6084 /* Return the largest value no greater than VAL that will fit as a 12-bit
6085 unsigned immediate created with a left shift of 0 or 12. */
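/* For example, 0x123456 is clamped to 0x123000.  */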
6086 static HOST_WIDE_INT
6087 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6088 {
6089 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6090 handle correctly. */
6091 gcc_assert ((val & 0xffffff) == val);
6092
6093 if (((val & 0xfff) << 0) == val)
6094 return val;
6095
6096 return val & (0xfff << 12);
6097 }
6098
6099 /* Return true if val is an immediate that can be loaded into a
6100 register by a MOVZ instruction. */
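/* For example, 0x12340000 can be loaded with a single MOVZ (a 16-bit chunk
   shifted by 16), but 0x12345678 cannot, since its set bits span two 16-bit
   chunks.  */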
6101 static bool
6102 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6103 {
6104 if (GET_MODE_SIZE (mode) > 4)
6105 {
6106 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6107 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6108 return 1;
6109 }
6110 else
6111 {
6112 /* Ignore sign extension. */
6113 val &= (HOST_WIDE_INT) 0xffffffff;
6114 }
6115 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6116 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6117 }
6118
6119 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6120 64-bit (DImode) integer. */
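/* For example, an HImode value of 0x00ab is replicated to
   0x00ab00ab00ab00ab.  */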
6121
6122 static unsigned HOST_WIDE_INT
6123 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6124 {
6125 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6126 while (size < 64)
6127 {
6128 val &= (HOST_WIDE_INT_1U << size) - 1;
6129 val |= val << size;
6130 size *= 2;
6131 }
6132 return val;
6133 }
6134
6135 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6136
6137 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6138 {
6139 0x0000000100000001ull,
6140 0x0001000100010001ull,
6141 0x0101010101010101ull,
6142 0x1111111111111111ull,
6143 0x5555555555555555ull,
6144 };
6145
6146
6147 /* Return true if val is a valid bitmask immediate. */
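/* For example, 0xff00ff00ff00ff00 (a repeating 16-bit element containing a
   single run of ones) is a valid bitmask immediate, whereas
   0x1234567812345678 is not.  */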
6148
6149 bool
6150 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6151 {
6152 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6153 int bits;
6154
6155 /* Check for a single sequence of one bits and return quickly if so.
6156 The special cases of all ones and all zeroes return false. */
6157 val = aarch64_replicate_bitmask_imm (val_in, mode);
6158 tmp = val + (val & -val);
6159
6160 if (tmp == (tmp & -tmp))
6161 return (val + 1) > 1;
6162
6163 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6164 if (mode == SImode)
6165 val = (val << 32) | (val & 0xffffffff);
6166
6167 /* Invert if the immediate doesn't start with a zero bit - this means we
6168 only need to search for sequences of one bits. */
6169 if (val & 1)
6170 val = ~val;
6171
6172 /* Find the first set bit and set tmp to val with the first sequence of one
6173 bits removed. Return success if there is a single sequence of ones. */
6174 first_one = val & -val;
6175 tmp = val & (val + first_one);
6176
6177 if (tmp == 0)
6178 return true;
6179
6180 /* Find the next set bit and compute the difference in bit position. */
6181 next_one = tmp & -tmp;
6182 bits = clz_hwi (first_one) - clz_hwi (next_one);
6183 mask = val ^ tmp;
6184
6185 /* Check the bit position difference is a power of 2, and that the first
6186 sequence of one bits fits within 'bits' bits. */
6187 if ((mask >> bits) != 0 || bits != (bits & -bits))
6188 return false;
6189
6190 /* Check the sequence of one bits is repeated 64/bits times. */
6191 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6192 }
6193
6194 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
6195 Assumed precondition: VAL_IN is not zero. */
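/* For example, VAL_IN = 0x00ff0f00 gives 0x00ffff00.  */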
6196
6197 unsigned HOST_WIDE_INT
6198 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6199 {
6200 int lowest_bit_set = ctz_hwi (val_in);
6201 int highest_bit_set = floor_log2 (val_in);
6202 gcc_assert (val_in != 0);
6203
6204 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6205 (HOST_WIDE_INT_1U << lowest_bit_set));
6206 }
6207
6208 /* Create a constant in which all bits outside the range from the lowest to
6209 the highest set bit of VAL_IN are set to 1. */
6210
6211 unsigned HOST_WIDE_INT
6212 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6213 {
6214 return val_in | ~aarch64_and_split_imm1 (val_in);
6215 }
6216
6217 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6218
6219 bool
6220 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6221 {
6222 scalar_int_mode int_mode;
6223 if (!is_a <scalar_int_mode> (mode, &int_mode))
6224 return false;
6225
6226 if (aarch64_bitmask_imm (val_in, int_mode))
6227 return false;
6228
6229 if (aarch64_move_imm (val_in, int_mode))
6230 return false;
6231
6232 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6233
6234 return aarch64_bitmask_imm (imm2, int_mode);
6235 }
6236
6237 /* Return true if val is an immediate that can be loaded into a
6238 register in a single instruction. */
6239 bool
6240 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6241 {
6242 scalar_int_mode int_mode;
6243 if (!is_a <scalar_int_mode> (mode, &int_mode))
6244 return false;
6245
6246 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6247 return 1;
6248 return aarch64_bitmask_imm (val, int_mode);
6249 }
6250
6251 static bool
6252 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6253 {
6254 rtx base, offset;
6255
6256 if (GET_CODE (x) == HIGH)
6257 return true;
6258
6259 /* There's no way to calculate VL-based values using relocations. */
6260 subrtx_iterator::array_type array;
6261 FOR_EACH_SUBRTX (iter, array, x, ALL)
6262 if (GET_CODE (*iter) == CONST_POLY_INT)
6263 return true;
6264
6265 split_const (x, &base, &offset);
6266 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6267 {
6268 if (aarch64_classify_symbol (base, INTVAL (offset))
6269 != SYMBOL_FORCE_TO_MEM)
6270 return true;
6271 else
6272 /* Avoid generating a 64-bit relocation in ILP32; leave
6273 to aarch64_expand_mov_immediate to handle it properly. */
6274 return mode != ptr_mode;
6275 }
6276
6277 return aarch64_tls_referenced_p (x);
6278 }
6279
6280 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6281 The expansion for a table switch is quite expensive due to the number
6282 of instructions, the table lookup and the hard-to-predict indirect jump.
6283 When optimizing for speed at -O3 or higher, use the per-core tuning if
6284 set; otherwise use tables for > 16 cases as a tradeoff between size and
6285 performance. When optimizing for size, use the default setting. */
6286
6287 static unsigned int
6288 aarch64_case_values_threshold (void)
6289 {
6290 /* Use the specified limit for the number of cases before using jump
6291 tables at higher optimization levels. */
6292 if (optimize > 2
6293 && selected_cpu->tune->max_case_values != 0)
6294 return selected_cpu->tune->max_case_values;
6295 else
6296 return optimize_size ? default_case_values_threshold () : 17;
6297 }
6298
6299 /* Return true if register REGNO is a valid index register.
6300 STRICT_P is true if REG_OK_STRICT is in effect. */
6301
6302 bool
6303 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6304 {
6305 if (!HARD_REGISTER_NUM_P (regno))
6306 {
6307 if (!strict_p)
6308 return true;
6309
6310 if (!reg_renumber)
6311 return false;
6312
6313 regno = reg_renumber[regno];
6314 }
6315 return GP_REGNUM_P (regno);
6316 }
6317
6318 /* Return true if register REGNO is a valid base register.
6319 STRICT_P is true if REG_OK_STRICT is in effect. */
6320
6321 bool
6322 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6323 {
6324 if (!HARD_REGISTER_NUM_P (regno))
6325 {
6326 if (!strict_p)
6327 return true;
6328
6329 if (!reg_renumber)
6330 return false;
6331
6332 regno = reg_renumber[regno];
6333 }
6334
6335 /* The fake registers will be eliminated to either the stack or
6336 hard frame pointer, both of which are usually valid base registers.
6337 Reload deals with the cases where the eliminated form isn't valid. */
6338 return (GP_REGNUM_P (regno)
6339 || regno == SP_REGNUM
6340 || regno == FRAME_POINTER_REGNUM
6341 || regno == ARG_POINTER_REGNUM);
6342 }
6343
6344 /* Return true if X is a valid base register.
6345 STRICT_P is true if REG_OK_STRICT is in effect. */
6346
6347 static bool
6348 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6349 {
6350 if (!strict_p
6351 && GET_CODE (x) == SUBREG
6352 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6353 x = SUBREG_REG (x);
6354
6355 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6356 }
6357
6358 /* Return true if address offset is a valid index. If it is, fill in INFO
6359 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6360
6361 static bool
6362 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6363 machine_mode mode, bool strict_p)
6364 {
6365 enum aarch64_address_type type;
6366 rtx index;
6367 int shift;
6368
6369 /* (reg:P) */
6370 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6371 && GET_MODE (x) == Pmode)
6372 {
6373 type = ADDRESS_REG_REG;
6374 index = x;
6375 shift = 0;
6376 }
6377 /* (sign_extend:DI (reg:SI)) */
6378 else if ((GET_CODE (x) == SIGN_EXTEND
6379 || GET_CODE (x) == ZERO_EXTEND)
6380 && GET_MODE (x) == DImode
6381 && GET_MODE (XEXP (x, 0)) == SImode)
6382 {
6383 type = (GET_CODE (x) == SIGN_EXTEND)
6384 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6385 index = XEXP (x, 0);
6386 shift = 0;
6387 }
6388 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6389 else if (GET_CODE (x) == MULT
6390 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6391 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6392 && GET_MODE (XEXP (x, 0)) == DImode
6393 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6394 && CONST_INT_P (XEXP (x, 1)))
6395 {
6396 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6397 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6398 index = XEXP (XEXP (x, 0), 0);
6399 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6400 }
6401 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6402 else if (GET_CODE (x) == ASHIFT
6403 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6404 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6405 && GET_MODE (XEXP (x, 0)) == DImode
6406 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6407 && CONST_INT_P (XEXP (x, 1)))
6408 {
6409 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6410 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6411 index = XEXP (XEXP (x, 0), 0);
6412 shift = INTVAL (XEXP (x, 1));
6413 }
6414 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6415 else if ((GET_CODE (x) == SIGN_EXTRACT
6416 || GET_CODE (x) == ZERO_EXTRACT)
6417 && GET_MODE (x) == DImode
6418 && GET_CODE (XEXP (x, 0)) == MULT
6419 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6420 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6421 {
6422 type = (GET_CODE (x) == SIGN_EXTRACT)
6423 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6424 index = XEXP (XEXP (x, 0), 0);
6425 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6426 if (INTVAL (XEXP (x, 1)) != 32 + shift
6427 || INTVAL (XEXP (x, 2)) != 0)
6428 shift = -1;
6429 }
6430 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6431 (const_int 0xffffffff<<shift)) */
6432 else if (GET_CODE (x) == AND
6433 && GET_MODE (x) == DImode
6434 && GET_CODE (XEXP (x, 0)) == MULT
6435 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6436 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6437 && CONST_INT_P (XEXP (x, 1)))
6438 {
6439 type = ADDRESS_REG_UXTW;
6440 index = XEXP (XEXP (x, 0), 0);
6441 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6442 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6443 shift = -1;
6444 }
6445 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6446 else if ((GET_CODE (x) == SIGN_EXTRACT
6447 || GET_CODE (x) == ZERO_EXTRACT)
6448 && GET_MODE (x) == DImode
6449 && GET_CODE (XEXP (x, 0)) == ASHIFT
6450 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6451 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6452 {
6453 type = (GET_CODE (x) == SIGN_EXTRACT)
6454 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6455 index = XEXP (XEXP (x, 0), 0);
6456 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6457 if (INTVAL (XEXP (x, 1)) != 32 + shift
6458 || INTVAL (XEXP (x, 2)) != 0)
6459 shift = -1;
6460 }
6461 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6462 (const_int 0xffffffff<<shift)) */
6463 else if (GET_CODE (x) == AND
6464 && GET_MODE (x) == DImode
6465 && GET_CODE (XEXP (x, 0)) == ASHIFT
6466 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6467 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6468 && CONST_INT_P (XEXP (x, 1)))
6469 {
6470 type = ADDRESS_REG_UXTW;
6471 index = XEXP (XEXP (x, 0), 0);
6472 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6473 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6474 shift = -1;
6475 }
6476 /* (mult:P (reg:P) (const_int scale)) */
6477 else if (GET_CODE (x) == MULT
6478 && GET_MODE (x) == Pmode
6479 && GET_MODE (XEXP (x, 0)) == Pmode
6480 && CONST_INT_P (XEXP (x, 1)))
6481 {
6482 type = ADDRESS_REG_REG;
6483 index = XEXP (x, 0);
6484 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6485 }
6486 /* (ashift:P (reg:P) (const_int shift)) */
6487 else if (GET_CODE (x) == ASHIFT
6488 && GET_MODE (x) == Pmode
6489 && GET_MODE (XEXP (x, 0)) == Pmode
6490 && CONST_INT_P (XEXP (x, 1)))
6491 {
6492 type = ADDRESS_REG_REG;
6493 index = XEXP (x, 0);
6494 shift = INTVAL (XEXP (x, 1));
6495 }
6496 else
6497 return false;
6498
6499 if (!strict_p
6500 && GET_CODE (index) == SUBREG
6501 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6502 index = SUBREG_REG (index);
6503
6504 if (aarch64_sve_data_mode_p (mode))
6505 {
6506 if (type != ADDRESS_REG_REG
6507 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6508 return false;
6509 }
6510 else
6511 {
6512 if (shift != 0
6513 && !(IN_RANGE (shift, 1, 3)
6514 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6515 return false;
6516 }
6517
6518 if (REG_P (index)
6519 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6520 {
6521 info->type = type;
6522 info->offset = index;
6523 info->shift = shift;
6524 return true;
6525 }
6526
6527 return false;
6528 }
6529
6530 /* Return true if MODE is one of the modes for which we
6531 support LDP/STP operations. */
6532
6533 static bool
6534 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6535 {
6536 return mode == SImode || mode == DImode
6537 || mode == SFmode || mode == DFmode
6538 || (aarch64_vector_mode_supported_p (mode)
6539 && (known_eq (GET_MODE_SIZE (mode), 8)
6540 || (known_eq (GET_MODE_SIZE (mode), 16)
6541 && (aarch64_tune_params.extra_tuning_flags
6542 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6543 }
6544
6545 /* Return true if REGNO is a virtual pointer register, or an eliminable
6546 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6547 include stack_pointer or hard_frame_pointer. */
6548 static bool
6549 virt_or_elim_regno_p (unsigned regno)
6550 {
6551 return ((regno >= FIRST_VIRTUAL_REGISTER
6552 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6553 || regno == FRAME_POINTER_REGNUM
6554 || regno == ARG_POINTER_REGNUM);
6555 }
6556
6557 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6558 If it is, fill in INFO appropriately. STRICT_P is true if
6559 REG_OK_STRICT is in effect. */
6560
6561 bool
6562 aarch64_classify_address (struct aarch64_address_info *info,
6563 rtx x, machine_mode mode, bool strict_p,
6564 aarch64_addr_query_type type)
6565 {
6566 enum rtx_code code = GET_CODE (x);
6567 rtx op0, op1;
6568 poly_int64 offset;
6569
6570 HOST_WIDE_INT const_size;
6571
6572 /* On BE, we use load/store pair for all large int mode load/stores.
6573 TI/TFmode may also use a load/store pair. */
6574 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6575 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6576 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6577 || type == ADDR_QUERY_LDP_STP_N
6578 || mode == TImode
6579 || mode == TFmode
6580 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6581
6582 /* If we are dealing with ADDR_QUERY_LDP_STP_N, that means the incoming mode
6583 corresponds to the actual size of the memory being loaded/stored and the
6584 mode used to check the addressing is half that size. */
6585 if (type == ADDR_QUERY_LDP_STP_N
6586 && known_eq (GET_MODE_SIZE (mode), 16))
6587 mode = DFmode;
6588
6589 bool allow_reg_index_p = (!load_store_pair_p
6590 && (known_lt (GET_MODE_SIZE (mode), 16)
6591 || vec_flags == VEC_ADVSIMD
6592 || vec_flags == VEC_SVE_DATA));
6593
6594 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6595 [Rn, #offset, MUL VL]. */
6596 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6597 && (code != REG && code != PLUS))
6598 return false;
6599
6600 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6601 REG addressing. */
6602 if (advsimd_struct_p
6603 && !BYTES_BIG_ENDIAN
6604 && (code != POST_INC && code != REG))
6605 return false;
6606
6607 gcc_checking_assert (GET_MODE (x) == VOIDmode
6608 || SCALAR_INT_MODE_P (GET_MODE (x)));
6609
6610 switch (code)
6611 {
6612 case REG:
6613 case SUBREG:
6614 info->type = ADDRESS_REG_IMM;
6615 info->base = x;
6616 info->offset = const0_rtx;
6617 info->const_offset = 0;
6618 return aarch64_base_register_rtx_p (x, strict_p);
6619
6620 case PLUS:
6621 op0 = XEXP (x, 0);
6622 op1 = XEXP (x, 1);
6623
6624 if (! strict_p
6625 && REG_P (op0)
6626 && virt_or_elim_regno_p (REGNO (op0))
6627 && poly_int_rtx_p (op1, &offset))
6628 {
6629 info->type = ADDRESS_REG_IMM;
6630 info->base = op0;
6631 info->offset = op1;
6632 info->const_offset = offset;
6633
6634 return true;
6635 }
6636
6637 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6638 && aarch64_base_register_rtx_p (op0, strict_p)
6639 && poly_int_rtx_p (op1, &offset))
6640 {
6641 info->type = ADDRESS_REG_IMM;
6642 info->base = op0;
6643 info->offset = op1;
6644 info->const_offset = offset;
6645
6646 /* TImode and TFmode values are allowed in both pairs of X
6647 registers and individual Q registers. The available
6648 address modes are:
6649 X,X: 7-bit signed scaled offset
6650 Q: 9-bit signed offset
6651 We conservatively require an offset representable in either mode.
6652 When performing the check for pairs of X registers i.e. LDP/STP
6653 pass down DImode since that is the natural size of the LDP/STP
6654 instruction memory accesses. */
6655 if (mode == TImode || mode == TFmode)
6656 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6657 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6658 || offset_12bit_unsigned_scaled_p (mode, offset)));
6659
6660 /* A 7-bit offset check because OImode will emit an ldp/stp
6661 instruction (only big endian will get here).
6662 For ldp/stp instructions, the offset is scaled for the size of a
6663 single element of the pair. */
6664 if (mode == OImode)
6665 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6666
6667 /* Three 9/12-bit offset checks because CImode will emit three
6668 ldr/str instructions (only big endian will get here). */
6669 if (mode == CImode)
6670 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6671 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6672 offset + 32)
6673 || offset_12bit_unsigned_scaled_p (V16QImode,
6674 offset + 32)));
6675
6676 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6677 instructions (only big endian will get here). */
6678 if (mode == XImode)
6679 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6680 && aarch64_offset_7bit_signed_scaled_p (TImode,
6681 offset + 32));
6682
6683 /* Make "m" use the LD1 offset range for SVE data modes, so
6684 that pre-RTL optimizers like ivopts will work to that
6685 instead of the wider LDR/STR range. */
6686 if (vec_flags == VEC_SVE_DATA)
6687 return (type == ADDR_QUERY_M
6688 ? offset_4bit_signed_scaled_p (mode, offset)
6689 : offset_9bit_signed_scaled_p (mode, offset));
6690
6691 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6692 {
6693 poly_int64 end_offset = (offset
6694 + GET_MODE_SIZE (mode)
6695 - BYTES_PER_SVE_VECTOR);
6696 return (type == ADDR_QUERY_M
6697 ? offset_4bit_signed_scaled_p (mode, offset)
6698 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6699 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6700 end_offset)));
6701 }
6702
6703 if (vec_flags == VEC_SVE_PRED)
6704 return offset_9bit_signed_scaled_p (mode, offset);
6705
6706 if (load_store_pair_p)
6707 return ((known_eq (GET_MODE_SIZE (mode), 4)
6708 || known_eq (GET_MODE_SIZE (mode), 8)
6709 || known_eq (GET_MODE_SIZE (mode), 16))
6710 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6711 else
6712 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6713 || offset_12bit_unsigned_scaled_p (mode, offset));
6714 }
6715
6716 if (allow_reg_index_p)
6717 {
6718 /* Look for base + (scaled/extended) index register. */
6719 if (aarch64_base_register_rtx_p (op0, strict_p)
6720 && aarch64_classify_index (info, op1, mode, strict_p))
6721 {
6722 info->base = op0;
6723 return true;
6724 }
6725 if (aarch64_base_register_rtx_p (op1, strict_p)
6726 && aarch64_classify_index (info, op0, mode, strict_p))
6727 {
6728 info->base = op1;
6729 return true;
6730 }
6731 }
6732
6733 return false;
6734
6735 case POST_INC:
6736 case POST_DEC:
6737 case PRE_INC:
6738 case PRE_DEC:
6739 info->type = ADDRESS_REG_WB;
6740 info->base = XEXP (x, 0);
6741 info->offset = NULL_RTX;
6742 return aarch64_base_register_rtx_p (info->base, strict_p);
6743
6744 case POST_MODIFY:
6745 case PRE_MODIFY:
6746 info->type = ADDRESS_REG_WB;
6747 info->base = XEXP (x, 0);
6748 if (GET_CODE (XEXP (x, 1)) == PLUS
6749 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6750 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6751 && aarch64_base_register_rtx_p (info->base, strict_p))
6752 {
6753 info->offset = XEXP (XEXP (x, 1), 1);
6754 info->const_offset = offset;
6755
6756 /* TImode and TFmode values are allowed in both pairs of X
6757 registers and individual Q registers. The available
6758 address modes are:
6759 X,X: 7-bit signed scaled offset
6760 Q: 9-bit signed offset
6761 We conservatively require an offset representable in either mode.
6762 */
6763 if (mode == TImode || mode == TFmode)
6764 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6765 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6766
6767 if (load_store_pair_p)
6768 return ((known_eq (GET_MODE_SIZE (mode), 4)
6769 || known_eq (GET_MODE_SIZE (mode), 8)
6770 || known_eq (GET_MODE_SIZE (mode), 16))
6771 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6772 else
6773 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6774 }
6775 return false;
6776
6777 case CONST:
6778 case SYMBOL_REF:
6779 case LABEL_REF:
6780 /* load literal: pc-relative constant pool entry. Only supported
6781 for SI mode or larger. */
6782 info->type = ADDRESS_SYMBOLIC;
6783
6784 if (!load_store_pair_p
6785 && GET_MODE_SIZE (mode).is_constant (&const_size)
6786 && const_size >= 4)
6787 {
6788 rtx sym, addend;
6789
6790 split_const (x, &sym, &addend);
6791 return ((GET_CODE (sym) == LABEL_REF
6792 || (GET_CODE (sym) == SYMBOL_REF
6793 && CONSTANT_POOL_ADDRESS_P (sym)
6794 && aarch64_pcrelative_literal_loads)));
6795 }
6796 return false;
6797
6798 case LO_SUM:
6799 info->type = ADDRESS_LO_SUM;
6800 info->base = XEXP (x, 0);
6801 info->offset = XEXP (x, 1);
6802 if (allow_reg_index_p
6803 && aarch64_base_register_rtx_p (info->base, strict_p))
6804 {
6805 rtx sym, offs;
6806 split_const (info->offset, &sym, &offs);
6807 if (GET_CODE (sym) == SYMBOL_REF
6808 && (aarch64_classify_symbol (sym, INTVAL (offs))
6809 == SYMBOL_SMALL_ABSOLUTE))
6810 {
6811 /* The symbol and offset must be aligned to the access size. */
6812 unsigned int align;
6813
6814 if (CONSTANT_POOL_ADDRESS_P (sym))
6815 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6816 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6817 {
6818 tree exp = SYMBOL_REF_DECL (sym);
6819 align = TYPE_ALIGN (TREE_TYPE (exp));
6820 align = aarch64_constant_alignment (exp, align);
6821 }
6822 else if (SYMBOL_REF_DECL (sym))
6823 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6824 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6825 && SYMBOL_REF_BLOCK (sym) != NULL)
6826 align = SYMBOL_REF_BLOCK (sym)->alignment;
6827 else
6828 align = BITS_PER_UNIT;
6829
6830 poly_int64 ref_size = GET_MODE_SIZE (mode);
6831 if (known_eq (ref_size, 0))
6832 ref_size = GET_MODE_SIZE (DImode);
6833
6834 return (multiple_p (INTVAL (offs), ref_size)
6835 && multiple_p (align / BITS_PER_UNIT, ref_size));
6836 }
6837 }
6838 return false;
6839
6840 default:
6841 return false;
6842 }
6843 }
6844
6845 /* Return true if the address X is valid for a PRFM instruction.
6846 STRICT_P is true if we should do strict checking with
6847 aarch64_classify_address. */
6848
6849 bool
6850 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6851 {
6852 struct aarch64_address_info addr;
6853
6854 /* PRFM accepts the same addresses as DImode... */
6855 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6856 if (!res)
6857 return false;
6858
6859 /* ... except writeback forms. */
6860 return addr.type != ADDRESS_REG_WB;
6861 }
6862
6863 bool
6864 aarch64_symbolic_address_p (rtx x)
6865 {
6866 rtx offset;
6867
6868 split_const (x, &x, &offset);
6869 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6870 }
6871
6872 /* Classify the base of symbolic expression X. */
6873
6874 enum aarch64_symbol_type
6875 aarch64_classify_symbolic_expression (rtx x)
6876 {
6877 rtx offset;
6878
6879 split_const (x, &x, &offset);
6880 return aarch64_classify_symbol (x, INTVAL (offset));
6881 }
6882
6883
6884 /* Return TRUE if X is a legitimate address for accessing memory in
6885 mode MODE. */
6886 static bool
6887 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6888 {
6889 struct aarch64_address_info addr;
6890
6891 return aarch64_classify_address (&addr, x, mode, strict_p);
6892 }
6893
6894 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6895 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6896 bool
6897 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6898 aarch64_addr_query_type type)
6899 {
6900 struct aarch64_address_info addr;
6901
6902 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6903 }
6904
6905 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6906
6907 static bool
6908 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6909 poly_int64 orig_offset,
6910 machine_mode mode)
6911 {
6912 HOST_WIDE_INT size;
6913 if (GET_MODE_SIZE (mode).is_constant (&size))
6914 {
6915 HOST_WIDE_INT const_offset, second_offset;
6916
6917 /* A general SVE offset is A * VQ + B. Remove the A component from
6918 coefficient 0 in order to get the constant B. */
6919 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6920
6921 /* Split an out-of-range address displacement into a base and
6922 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6923 range otherwise to increase opportunities for sharing the base
6924 address between accesses of different sizes. Unaligned accesses use
6925 the signed 9-bit range; TImode/TFmode use the intersection of signed
6926 scaled 7-bit and signed 9-bit offsets. */
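/* For example, with an aligned SImode offset of 0x12344 the code below
yields offset1 = 0x10000 and offset2 = 0x2344, the latter fitting the
scaled unsigned 12-bit LDR/STR range; an unaligned offset of 0x12345
instead yields offset1 = 0x12400 and offset2 = -0xbb, which fits the
signed 9-bit range. */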
6927 if (mode == TImode || mode == TFmode)
6928 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6929 else if ((const_offset & (size - 1)) != 0)
6930 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6931 else
6932 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6933
6934 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6935 return false;
6936
6937 /* Split the offset into second_offset and the rest. */
6938 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6939 *offset2 = gen_int_mode (second_offset, Pmode);
6940 return true;
6941 }
6942 else
6943 {
6944 /* Get the mode we should use as the basis of the range. For structure
6945 modes this is the mode of one vector. */
6946 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6947 machine_mode step_mode
6948 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6949
6950 /* Get the "mul vl" multiplier we'd like to use. */
6951 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6952 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6953 if (vec_flags & VEC_SVE_DATA)
6954 /* LDR supports a 9-bit range, but the move patterns for
6955 structure modes require all vectors to be in range of the
6956 same base. The simplest way of accommodating that while still
6957 promoting reuse of anchor points between different modes is
6958 to use an 8-bit range unconditionally. */
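/* For example, a "mul vl" multiplier of 130 is reduced here to -126,
so 256 vectors' worth of offset goes into offset1 (256 - 126 == 130). */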
6959 vnum = ((vnum + 128) & 255) - 128;
6960 else
6961 /* Predicates are only handled singly, so we might as well use
6962 the full range. */
6963 vnum = ((vnum + 256) & 511) - 256;
6964 if (vnum == 0)
6965 return false;
6966
6967 /* Convert the "mul vl" multiplier into a byte offset. */
6968 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6969 if (known_eq (second_offset, orig_offset))
6970 return false;
6971
6972 /* Split the offset into second_offset and the rest. */
6973 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6974 *offset2 = gen_int_mode (second_offset, Pmode);
6975 return true;
6976 }
6977 }
6978
6979 /* Return the binary representation of floating point constant VALUE in INTVAL.
6980 If the value cannot be converted, return false without setting INTVAL.
6981 The conversion is done in the given MODE. */
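/* For example, (double) 1.0 is returned as 0x3ff0000000000000 and
(float) -2.0f as 0xc0000000. */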
6982 bool
6983 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6984 {
6985
6986 /* We make a general exception for 0. */
6987 if (aarch64_float_const_zero_rtx_p (value))
6988 {
6989 *intval = 0;
6990 return true;
6991 }
6992
6993 scalar_float_mode mode;
6994 if (GET_CODE (value) != CONST_DOUBLE
6995 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6996 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6997 /* Only support up to DF mode. */
6998 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6999 return false;
7000
7001 unsigned HOST_WIDE_INT ival = 0;
7002
7003 long res[2];
7004 real_to_target (res,
7005 CONST_DOUBLE_REAL_VALUE (value),
7006 REAL_MODE_FORMAT (mode));
7007
7008 if (mode == DFmode)
7009 {
7010 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7011 ival = zext_hwi (res[order], 32);
7012 ival |= (zext_hwi (res[1 - order], 32) << 32);
7013 }
7014 else
7015 ival = zext_hwi (res[0], 32);
7016
7017 *intval = ival;
7018 return true;
7019 }
7020
7021 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7022 single MOV(+MOVK) followed by an FMOV. */
7023 bool
7024 aarch64_float_const_rtx_p (rtx x)
7025 {
7026 machine_mode mode = GET_MODE (x);
7027 if (mode == VOIDmode)
7028 return false;
7029
7030 /* Determine whether it's cheaper to materialize float constants as
7031 mov/movk pairs than as adrp/ldr constant-pool loads. */
7032 unsigned HOST_WIDE_INT ival;
7033
7034 if (GET_CODE (x) == CONST_DOUBLE
7035 && SCALAR_FLOAT_MODE_P (mode)
7036 && aarch64_reinterpret_float_as_int (x, &ival))
7037 {
7038 scalar_int_mode imode = (mode == HFmode
7039 ? SImode
7040 : int_mode_for_mode (mode).require ());
7041 int num_instr = aarch64_internal_mov_immediate
7042 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7043 return num_instr < 3;
7044 }
7045
7046 return false;
7047 }
7048
7049 /* Return TRUE if rtx X is the immediate constant 0.0. */
7050 bool
7051 aarch64_float_const_zero_rtx_p (rtx x)
7052 {
7053 if (GET_MODE (x) == VOIDmode)
7054 return false;
7055
7056 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7057 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7058 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7059 }
7060
7061 /* Return TRUE if rtx X is an immediate constant that fits in a single
7062 MOVI immediate operation. */
7063 bool
7064 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7065 {
7066 if (!TARGET_SIMD)
7067 return false;
7068
7069 machine_mode vmode;
7070 scalar_int_mode imode;
7071 unsigned HOST_WIDE_INT ival;
7072
7073 if (GET_CODE (x) == CONST_DOUBLE
7074 && SCALAR_FLOAT_MODE_P (mode))
7075 {
7076 if (!aarch64_reinterpret_float_as_int (x, &ival))
7077 return false;
7078
7079 /* We make a general exception for 0. */
7080 if (aarch64_float_const_zero_rtx_p (x))
7081 return true;
7082
7083 imode = int_mode_for_mode (mode).require ();
7084 }
7085 else if (GET_CODE (x) == CONST_INT
7086 && is_a <scalar_int_mode> (mode, &imode))
7087 ival = INTVAL (x);
7088 else
7089 return false;
7090
7091 /* Use a 64-bit vector mode for everything except DI/DF mode, where we
7092 use a 128-bit vector mode. */
7093 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7094
7095 vmode = aarch64_simd_container_mode (imode, width);
7096 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7097
7098 return aarch64_simd_valid_immediate (v_op, NULL);
7099 }
7100
7101
7102 /* Return the fixed registers used for condition codes. */
7103
7104 static bool
7105 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7106 {
7107 *p1 = CC_REGNUM;
7108 *p2 = INVALID_REGNUM;
7109 return true;
7110 }
7111
7112 /* This function is used by the call expanders of the machine description.
7113 RESULT is the register in which the result is returned. It's NULL for
7114 "call" and "sibcall".
7115 MEM is the location of the function call.
7116 SIBCALL indicates whether this is a normal call or a sibling call;
7117 a different pattern is generated accordingly. */
7118
7119 void
7120 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7121 {
7122 rtx call, callee, tmp;
7123 rtvec vec;
7124 machine_mode mode;
7125
7126 gcc_assert (MEM_P (mem));
7127 callee = XEXP (mem, 0);
7128 mode = GET_MODE (callee);
7129 gcc_assert (mode == Pmode);
7130
7131 /* Decide if we should generate indirect calls by loading the
7132 address of the callee into a register before performing
7133 the branch-and-link. */
7134 if (SYMBOL_REF_P (callee)
7135 ? (aarch64_is_long_call_p (callee)
7136 || aarch64_is_noplt_call_p (callee))
7137 : !REG_P (callee))
7138 XEXP (mem, 0) = force_reg (mode, callee);
7139
7140 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7141
7142 if (result != NULL_RTX)
7143 call = gen_rtx_SET (result, call);
7144
7145 if (sibcall)
7146 tmp = ret_rtx;
7147 else
7148 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7149
7150 vec = gen_rtvec (2, call, tmp);
7151 call = gen_rtx_PARALLEL (VOIDmode, vec);
7152
7153 aarch64_emit_call_insn (call);
7154 }
7155
7156 /* Emit call insn with PAT and do aarch64-specific handling. */
7157
7158 void
7159 aarch64_emit_call_insn (rtx pat)
7160 {
7161 rtx insn = emit_call_insn (pat);
7162
7163 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7164 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7165 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7166 }
7167
7168 machine_mode
7169 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7170 {
7171 machine_mode mode_x = GET_MODE (x);
7172 rtx_code code_x = GET_CODE (x);
7173
7174 /* Floating-point compares return CCFP for equality, ORDERED/UNORDERED
7175 and the UN* comparisons, and CCFPE for the remaining ordered ones. */
7176 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7177 {
7178 switch (code)
7179 {
7180 case EQ:
7181 case NE:
7182 case UNORDERED:
7183 case ORDERED:
7184 case UNLT:
7185 case UNLE:
7186 case UNGT:
7187 case UNGE:
7188 case UNEQ:
7189 return CCFPmode;
7190
7191 case LT:
7192 case LE:
7193 case GT:
7194 case GE:
7195 case LTGT:
7196 return CCFPEmode;
7197
7198 default:
7199 gcc_unreachable ();
7200 }
7201 }
7202
7203 /* Equality comparisons of short modes against zero can be performed
7204 using the TST instruction with the appropriate bitmask. */
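/* E.g. (eq (reg:QI) (const_int 0)) can be emitted as "tst wN, 255". */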
7205 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7206 && (code == EQ || code == NE)
7207 && (mode_x == HImode || mode_x == QImode))
7208 return CC_NZmode;
7209
7210 /* Similarly, comparisons of zero_extends from shorter modes can
7211 be performed using an ANDS with an immediate mask. */
7212 if (y == const0_rtx && code_x == ZERO_EXTEND
7213 && (mode_x == SImode || mode_x == DImode)
7214 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7215 && (code == EQ || code == NE))
7216 return CC_NZmode;
7217
7218 if ((mode_x == SImode || mode_x == DImode)
7219 && y == const0_rtx
7220 && (code == EQ || code == NE || code == LT || code == GE)
7221 && (code_x == PLUS || code_x == MINUS || code_x == AND
7222 || code_x == NEG
7223 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7224 && CONST_INT_P (XEXP (x, 2)))))
7225 return CC_NZmode;
7226
7227 /* A compare with a shifted operand. Because of canonicalization,
7228 the comparison will have to be swapped when we emit the assembly
7229 code. */
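/* For example, (compare (ashift x 2) y) has to be output as
"cmp y, x, lsl 2", so a GT test on the original comparison is printed
as LT; see the E_CC_SWPmode case in aarch64_get_condition_code_1. */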
7230 if ((mode_x == SImode || mode_x == DImode)
7231 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7232 && (code_x == ASHIFT || code_x == ASHIFTRT
7233 || code_x == LSHIFTRT
7234 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7235 return CC_SWPmode;
7236
7237 /* Similarly for a negated operand, but we can only do this for
7238 equalities. */
7239 if ((mode_x == SImode || mode_x == DImode)
7240 && (REG_P (y) || GET_CODE (y) == SUBREG)
7241 && (code == EQ || code == NE)
7242 && code_x == NEG)
7243 return CC_Zmode;
7244
7245 /* A test for unsigned overflow from an addition. */
7246 if ((mode_x == DImode || mode_x == TImode)
7247 && (code == LTU || code == GEU)
7248 && code_x == PLUS
7249 && rtx_equal_p (XEXP (x, 0), y))
7250 return CC_Cmode;
7251
7252 /* A test for unsigned overflow from an add with carry. */
7253 if ((mode_x == DImode || mode_x == TImode)
7254 && (code == LTU || code == GEU)
7255 && code_x == PLUS
7256 && CONST_SCALAR_INT_P (y)
7257 && (rtx_mode_t (y, mode_x)
7258 == (wi::shwi (1, mode_x)
7259 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7260 return CC_ADCmode;
7261
7262 /* A test for signed overflow. */
7263 if ((mode_x == DImode || mode_x == TImode)
7264 && code == NE
7265 && code_x == PLUS
7266 && GET_CODE (y) == SIGN_EXTEND)
7267 return CC_Vmode;
7268
7269 /* For everything else, return CCmode. */
7270 return CCmode;
7271 }
7272
7273 static int
7274 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7275
7276 int
7277 aarch64_get_condition_code (rtx x)
7278 {
7279 machine_mode mode = GET_MODE (XEXP (x, 0));
7280 enum rtx_code comp_code = GET_CODE (x);
7281
7282 if (GET_MODE_CLASS (mode) != MODE_CC)
7283 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7284 return aarch64_get_condition_code_1 (mode, comp_code);
7285 }
7286
7287 static int
7288 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7289 {
7290 switch (mode)
7291 {
7292 case E_CCFPmode:
7293 case E_CCFPEmode:
7294 switch (comp_code)
7295 {
7296 case GE: return AARCH64_GE;
7297 case GT: return AARCH64_GT;
7298 case LE: return AARCH64_LS;
7299 case LT: return AARCH64_MI;
7300 case NE: return AARCH64_NE;
7301 case EQ: return AARCH64_EQ;
7302 case ORDERED: return AARCH64_VC;
7303 case UNORDERED: return AARCH64_VS;
7304 case UNLT: return AARCH64_LT;
7305 case UNLE: return AARCH64_LE;
7306 case UNGT: return AARCH64_HI;
7307 case UNGE: return AARCH64_PL;
7308 default: return -1;
7309 }
7310 break;
7311
7312 case E_CCmode:
7313 switch (comp_code)
7314 {
7315 case NE: return AARCH64_NE;
7316 case EQ: return AARCH64_EQ;
7317 case GE: return AARCH64_GE;
7318 case GT: return AARCH64_GT;
7319 case LE: return AARCH64_LE;
7320 case LT: return AARCH64_LT;
7321 case GEU: return AARCH64_CS;
7322 case GTU: return AARCH64_HI;
7323 case LEU: return AARCH64_LS;
7324 case LTU: return AARCH64_CC;
7325 default: return -1;
7326 }
7327 break;
7328
7329 case E_CC_SWPmode:
7330 switch (comp_code)
7331 {
7332 case NE: return AARCH64_NE;
7333 case EQ: return AARCH64_EQ;
7334 case GE: return AARCH64_LE;
7335 case GT: return AARCH64_LT;
7336 case LE: return AARCH64_GE;
7337 case LT: return AARCH64_GT;
7338 case GEU: return AARCH64_LS;
7339 case GTU: return AARCH64_CC;
7340 case LEU: return AARCH64_CS;
7341 case LTU: return AARCH64_HI;
7342 default: return -1;
7343 }
7344 break;
7345
7346 case E_CC_NZmode:
7347 switch (comp_code)
7348 {
7349 case NE: return AARCH64_NE;
7350 case EQ: return AARCH64_EQ;
7351 case GE: return AARCH64_PL;
7352 case LT: return AARCH64_MI;
7353 default: return -1;
7354 }
7355 break;
7356
7357 case E_CC_Zmode:
7358 switch (comp_code)
7359 {
7360 case NE: return AARCH64_NE;
7361 case EQ: return AARCH64_EQ;
7362 default: return -1;
7363 }
7364 break;
7365
7366 case E_CC_Cmode:
7367 switch (comp_code)
7368 {
7369 case LTU: return AARCH64_CS;
7370 case GEU: return AARCH64_CC;
7371 default: return -1;
7372 }
7373 break;
7374
7375 case E_CC_ADCmode:
7376 switch (comp_code)
7377 {
7378 case GEU: return AARCH64_CS;
7379 case LTU: return AARCH64_CC;
7380 default: return -1;
7381 }
7382 break;
7383
7384 case E_CC_Vmode:
7385 switch (comp_code)
7386 {
7387 case NE: return AARCH64_VS;
7388 case EQ: return AARCH64_VC;
7389 default: return -1;
7390 }
7391 break;
7392
7393 default:
7394 return -1;
7395 }
7396
7397 return -1;
7398 }
7399
7400 bool
7401 aarch64_const_vec_all_same_in_range_p (rtx x,
7402 HOST_WIDE_INT minval,
7403 HOST_WIDE_INT maxval)
7404 {
7405 rtx elt;
7406 return (const_vec_duplicate_p (x, &elt)
7407 && CONST_INT_P (elt)
7408 && IN_RANGE (INTVAL (elt), minval, maxval));
7409 }
7410
7411 bool
7412 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7413 {
7414 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7415 }
7416
7417 /* Return true if VEC is a constant in which every element is in the range
7418 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7419
7420 static bool
7421 aarch64_const_vec_all_in_range_p (rtx vec,
7422 HOST_WIDE_INT minval,
7423 HOST_WIDE_INT maxval)
7424 {
7425 if (GET_CODE (vec) != CONST_VECTOR
7426 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7427 return false;
7428
7429 int nunits;
7430 if (!CONST_VECTOR_STEPPED_P (vec))
7431 nunits = const_vector_encoded_nelts (vec);
7432 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7433 return false;
7434
7435 for (int i = 0; i < nunits; i++)
7436 {
7437 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7438 if (!CONST_INT_P (vec_elem)
7439 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7440 return false;
7441 }
7442 return true;
7443 }
7444
7445 /* N Z C V. */
7446 #define AARCH64_CC_V 1
7447 #define AARCH64_CC_C (1 << 1)
7448 #define AARCH64_CC_Z (1 << 2)
7449 #define AARCH64_CC_N (1 << 3)
7450
7451 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7452 static const int aarch64_nzcv_codes[] =
7453 {
7454 0, /* EQ, Z == 1. */
7455 AARCH64_CC_Z, /* NE, Z == 0. */
7456 0, /* CS, C == 1. */
7457 AARCH64_CC_C, /* CC, C == 0. */
7458 0, /* MI, N == 1. */
7459 AARCH64_CC_N, /* PL, N == 0. */
7460 0, /* VS, V == 1. */
7461 AARCH64_CC_V, /* VC, V == 0. */
7462 0, /* HI, C == 1 && Z == 0. */
7463 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7464 AARCH64_CC_V, /* GE, N == V. */
7465 0, /* LT, N != V. */
7466 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7467 0, /* LE, !(Z == 0 && N == V). */
7468 0, /* AL, Any. */
7469 0 /* NV, Any. */
7470 };
7471
7472 /* Print floating-point vector immediate operand X to F, negating it
7473 first if NEGATE is true. Return true on success, false if it isn't
7474 a constant we can handle. */
7475
7476 static bool
7477 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7478 {
7479 rtx elt;
7480
7481 if (!const_vec_duplicate_p (x, &elt))
7482 return false;
7483
7484 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7485 if (negate)
7486 r = real_value_negate (&r);
7487
7488 /* We only handle the SVE single-bit immediates here. */
7489 if (real_equal (&r, &dconst0))
7490 asm_fprintf (f, "0.0");
7491 else if (real_equal (&r, &dconst1))
7492 asm_fprintf (f, "1.0");
7493 else if (real_equal (&r, &dconsthalf))
7494 asm_fprintf (f, "0.5");
7495 else
7496 return false;
7497
7498 return true;
7499 }
7500
7501 /* Return the equivalent letter for size. */
7502 static char
7503 sizetochar (int size)
7504 {
7505 switch (size)
7506 {
7507 case 64: return 'd';
7508 case 32: return 's';
7509 case 16: return 'h';
7510 case 8 : return 'b';
7511 default: gcc_unreachable ();
7512 }
7513 }
7514
7515 /* Print operand X to file F in a target specific manner according to CODE.
7516 The acceptable formatting commands given by CODE are:
7517 'c': An integer or symbol address without a preceding #
7518 sign.
7519 'C': Take the duplicated element in a vector constant
7520 and print it in hex.
7521 'D': Take the duplicated element in a vector constant
7522 and print it as an unsigned integer, in decimal.
7523 'e': Print the sign/zero-extend size as a character 8->b,
7524 16->h, 32->w.
7525 'p': Prints N such that 2^N == X (X must be power of 2 and
7526 const int).
7527 'P': Print the number of non-zero bits in X (a const_int).
7528 'H': Print the higher numbered register of a pair (TImode)
7529 of regs.
7530 'm': Print a condition (eq, ne, etc).
7531 'M': Same as 'm', but invert condition.
7532 'N': Take the duplicated element in a vector constant
7533 and print the negative of it in decimal.
7534 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7535 'S/T/U/V': Print a FP/SIMD register name for a register list.
7536 The register printed is the FP/SIMD register name
7537 of X + 0/1/2/3 for S/T/U/V.
7538 'R': Print a scalar FP/SIMD register name + 1.
7539 'X': Print bottom 16 bits of integer constant in hex.
7540 'w/x': Print a general register name or the zero register
7541 (32-bit or 64-bit).
7542 '0': Print a normal operand; if it's a general register,
7543 we assume DImode.
7544 'k': Print NZCV for conditional compare instructions.
7545 'A': Output address constant representing the first
7546 argument of X, specifying a relocation offset
7547 if appropriate.
7548 'L': Output constant address specified by X
7549 with a relocation offset if appropriate.
7550 'G': Prints address of X, specifying a PC relative
7551 relocation mode if appropriate.
7552 'y': Output address of LDP or STP - this is used for
7553 some LDP/STPs which don't use a PARALLEL in their
7554 pattern (so the mode needs to be adjusted).
7555 'z': Output address of a typical LDP or STP. */
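/* For example, for a DImode value held in register x5, "%w0" prints
"w5" and "%x0" prints "x5", while "%X0" applied to
(const_int 0x12345) prints "0x2345". */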
7556
7557 static void
7558 aarch64_print_operand (FILE *f, rtx x, int code)
7559 {
7560 rtx elt;
7561 switch (code)
7562 {
7563 case 'c':
7564 switch (GET_CODE (x))
7565 {
7566 case CONST_INT:
7567 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7568 break;
7569
7570 case SYMBOL_REF:
7571 output_addr_const (f, x);
7572 break;
7573
7574 case CONST:
7575 if (GET_CODE (XEXP (x, 0)) == PLUS
7576 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7577 {
7578 output_addr_const (f, x);
7579 break;
7580 }
7581 /* Fall through. */
7582
7583 default:
7584 output_operand_lossage ("unsupported operand for code '%c'", code);
7585 }
7586 break;
7587
7588 case 'e':
7589 {
7590 int n;
7591
7592 if (!CONST_INT_P (x)
7593 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7594 {
7595 output_operand_lossage ("invalid operand for '%%%c'", code);
7596 return;
7597 }
7598
7599 switch (n)
7600 {
7601 case 3:
7602 fputc ('b', f);
7603 break;
7604 case 4:
7605 fputc ('h', f);
7606 break;
7607 case 5:
7608 fputc ('w', f);
7609 break;
7610 default:
7611 output_operand_lossage ("invalid operand for '%%%c'", code);
7612 return;
7613 }
7614 }
7615 break;
7616
7617 case 'p':
7618 {
7619 int n;
7620
7621 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7622 {
7623 output_operand_lossage ("invalid operand for '%%%c'", code);
7624 return;
7625 }
7626
7627 asm_fprintf (f, "%d", n);
7628 }
7629 break;
7630
7631 case 'P':
7632 if (!CONST_INT_P (x))
7633 {
7634 output_operand_lossage ("invalid operand for '%%%c'", code);
7635 return;
7636 }
7637
7638 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7639 break;
7640
7641 case 'H':
7642 if (x == const0_rtx)
7643 {
7644 asm_fprintf (f, "xzr");
7645 break;
7646 }
7647
7648 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7649 {
7650 output_operand_lossage ("invalid operand for '%%%c'", code);
7651 return;
7652 }
7653
7654 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7655 break;
7656
7657 case 'M':
7658 case 'm':
7659 {
7660 int cond_code;
7661 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7662 if (x == const_true_rtx)
7663 {
7664 if (code == 'M')
7665 fputs ("nv", f);
7666 return;
7667 }
7668
7669 if (!COMPARISON_P (x))
7670 {
7671 output_operand_lossage ("invalid operand for '%%%c'", code);
7672 return;
7673 }
7674
7675 cond_code = aarch64_get_condition_code (x);
7676 gcc_assert (cond_code >= 0);
7677 if (code == 'M')
7678 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7679 fputs (aarch64_condition_codes[cond_code], f);
7680 }
7681 break;
7682
7683 case 'N':
7684 if (!const_vec_duplicate_p (x, &elt))
7685 {
7686 output_operand_lossage ("invalid vector constant");
7687 return;
7688 }
7689
7690 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7691 asm_fprintf (f, "%wd", -INTVAL (elt));
7692 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7693 && aarch64_print_vector_float_operand (f, x, true))
7694 ;
7695 else
7696 {
7697 output_operand_lossage ("invalid vector constant");
7698 return;
7699 }
7700 break;
7701
7702 case 'b':
7703 case 'h':
7704 case 's':
7705 case 'd':
7706 case 'q':
7707 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7708 {
7709 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7710 return;
7711 }
7712 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7713 break;
7714
7715 case 'S':
7716 case 'T':
7717 case 'U':
7718 case 'V':
7719 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7720 {
7721 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7722 return;
7723 }
7724 asm_fprintf (f, "%c%d",
7725 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7726 REGNO (x) - V0_REGNUM + (code - 'S'));
7727 break;
7728
7729 case 'R':
7730 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7731 {
7732 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7733 return;
7734 }
7735 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7736 break;
7737
7738 case 'X':
7739 if (!CONST_INT_P (x))
7740 {
7741 output_operand_lossage ("invalid operand for '%%%c'", code);
7742 return;
7743 }
7744 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7745 break;
7746
7747 case 'C':
7748 {
7749 /* Print a replicated constant in hex. */
7750 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7751 {
7752 output_operand_lossage ("invalid operand for '%%%c'", code);
7753 return;
7754 }
7755 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7756 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7757 }
7758 break;
7759
7760 case 'D':
7761 {
7762 /* Print a replicated constant in decimal, treating it as
7763 unsigned. */
7764 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7765 {
7766 output_operand_lossage ("invalid operand for '%%%c'", code);
7767 return;
7768 }
7769 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7770 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7771 }
7772 break;
7773
7774 case 'w':
7775 case 'x':
7776 if (x == const0_rtx
7777 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7778 {
7779 asm_fprintf (f, "%czr", code);
7780 break;
7781 }
7782
7783 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7784 {
7785 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7786 break;
7787 }
7788
7789 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7790 {
7791 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7792 break;
7793 }
7794
7795 /* Fall through */
7796
7797 case 0:
7798 if (x == NULL)
7799 {
7800 output_operand_lossage ("missing operand");
7801 return;
7802 }
7803
7804 switch (GET_CODE (x))
7805 {
7806 case REG:
7807 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7808 {
7809 if (REG_NREGS (x) == 1)
7810 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7811 else
7812 {
7813 char suffix
7814 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7815 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7816 REGNO (x) - V0_REGNUM, suffix,
7817 END_REGNO (x) - V0_REGNUM - 1, suffix);
7818 }
7819 }
7820 else
7821 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7822 break;
7823
7824 case MEM:
7825 output_address (GET_MODE (x), XEXP (x, 0));
7826 break;
7827
7828 case LABEL_REF:
7829 case SYMBOL_REF:
7830 output_addr_const (asm_out_file, x);
7831 break;
7832
7833 case CONST_INT:
7834 asm_fprintf (f, "%wd", INTVAL (x));
7835 break;
7836
7837 case CONST:
7838 if (!VECTOR_MODE_P (GET_MODE (x)))
7839 {
7840 output_addr_const (asm_out_file, x);
7841 break;
7842 }
7843 /* fall through */
7844
7845 case CONST_VECTOR:
7846 if (!const_vec_duplicate_p (x, &elt))
7847 {
7848 output_operand_lossage ("invalid vector constant");
7849 return;
7850 }
7851
7852 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7853 asm_fprintf (f, "%wd", INTVAL (elt));
7854 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7855 && aarch64_print_vector_float_operand (f, x, false))
7856 ;
7857 else
7858 {
7859 output_operand_lossage ("invalid vector constant");
7860 return;
7861 }
7862 break;
7863
7864 case CONST_DOUBLE:
7865 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7866 be getting CONST_DOUBLEs holding integers. */
7867 gcc_assert (GET_MODE (x) != VOIDmode);
7868 if (aarch64_float_const_zero_rtx_p (x))
7869 {
7870 fputc ('0', f);
7871 break;
7872 }
7873 else if (aarch64_float_const_representable_p (x))
7874 {
7875 #define buf_size 20
7876 char float_buf[buf_size] = {'\0'};
7877 real_to_decimal_for_mode (float_buf,
7878 CONST_DOUBLE_REAL_VALUE (x),
7879 buf_size, buf_size,
7880 1, GET_MODE (x));
7881 asm_fprintf (asm_out_file, "%s", float_buf);
7882 break;
7883 #undef buf_size
7884 }
7885 output_operand_lossage ("invalid constant");
7886 return;
7887 default:
7888 output_operand_lossage ("invalid operand");
7889 return;
7890 }
7891 break;
7892
7893 case 'A':
7894 if (GET_CODE (x) == HIGH)
7895 x = XEXP (x, 0);
7896
7897 switch (aarch64_classify_symbolic_expression (x))
7898 {
7899 case SYMBOL_SMALL_GOT_4G:
7900 asm_fprintf (asm_out_file, ":got:");
7901 break;
7902
7903 case SYMBOL_SMALL_TLSGD:
7904 asm_fprintf (asm_out_file, ":tlsgd:");
7905 break;
7906
7907 case SYMBOL_SMALL_TLSDESC:
7908 asm_fprintf (asm_out_file, ":tlsdesc:");
7909 break;
7910
7911 case SYMBOL_SMALL_TLSIE:
7912 asm_fprintf (asm_out_file, ":gottprel:");
7913 break;
7914
7915 case SYMBOL_TLSLE24:
7916 asm_fprintf (asm_out_file, ":tprel:");
7917 break;
7918
7919 case SYMBOL_TINY_GOT:
7920 gcc_unreachable ();
7921 break;
7922
7923 default:
7924 break;
7925 }
7926 output_addr_const (asm_out_file, x);
7927 break;
7928
7929 case 'L':
7930 switch (aarch64_classify_symbolic_expression (x))
7931 {
7932 case SYMBOL_SMALL_GOT_4G:
7933 asm_fprintf (asm_out_file, ":lo12:");
7934 break;
7935
7936 case SYMBOL_SMALL_TLSGD:
7937 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7938 break;
7939
7940 case SYMBOL_SMALL_TLSDESC:
7941 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7942 break;
7943
7944 case SYMBOL_SMALL_TLSIE:
7945 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7946 break;
7947
7948 case SYMBOL_TLSLE12:
7949 asm_fprintf (asm_out_file, ":tprel_lo12:");
7950 break;
7951
7952 case SYMBOL_TLSLE24:
7953 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7954 break;
7955
7956 case SYMBOL_TINY_GOT:
7957 asm_fprintf (asm_out_file, ":got:");
7958 break;
7959
7960 case SYMBOL_TINY_TLSIE:
7961 asm_fprintf (asm_out_file, ":gottprel:");
7962 break;
7963
7964 default:
7965 break;
7966 }
7967 output_addr_const (asm_out_file, x);
7968 break;
7969
7970 case 'G':
7971 switch (aarch64_classify_symbolic_expression (x))
7972 {
7973 case SYMBOL_TLSLE24:
7974 asm_fprintf (asm_out_file, ":tprel_hi12:");
7975 break;
7976 default:
7977 break;
7978 }
7979 output_addr_const (asm_out_file, x);
7980 break;
7981
7982 case 'k':
7983 {
7984 HOST_WIDE_INT cond_code;
7985
7986 if (!CONST_INT_P (x))
7987 {
7988 output_operand_lossage ("invalid operand for '%%%c'", code);
7989 return;
7990 }
7991
7992 cond_code = INTVAL (x);
7993 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7994 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7995 }
7996 break;
7997
7998 case 'y':
7999 case 'z':
8000 {
8001 machine_mode mode = GET_MODE (x);
8002
8003 if (GET_CODE (x) != MEM
8004 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8005 {
8006 output_operand_lossage ("invalid operand for '%%%c'", code);
8007 return;
8008 }
8009
8010 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8011 code == 'y'
8012 ? ADDR_QUERY_LDP_STP_N
8013 : ADDR_QUERY_LDP_STP))
8014 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8015 }
8016 break;
8017
8018 default:
8019 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8020 return;
8021 }
8022 }
8023
8024 /* Print address 'x' of a memory access with mode 'mode'.
8025 'type' is the aarch64_addr_query_type context required by
8026 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for LDP/STP operands. */
8027 static bool
8028 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8029 aarch64_addr_query_type type)
8030 {
8031 struct aarch64_address_info addr;
8032 unsigned int size;
8033
8034 /* Check all addresses are Pmode - including ILP32. */
8035 if (GET_MODE (x) != Pmode
8036 && (!CONST_INT_P (x)
8037 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8038 {
8039 output_operand_lossage ("invalid address mode");
8040 return false;
8041 }
8042
8043 if (aarch64_classify_address (&addr, x, mode, true, type))
8044 switch (addr.type)
8045 {
8046 case ADDRESS_REG_IMM:
8047 if (known_eq (addr.const_offset, 0))
8048 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8049 else if (aarch64_sve_data_mode_p (mode))
8050 {
8051 HOST_WIDE_INT vnum
8052 = exact_div (addr.const_offset,
8053 BYTES_PER_SVE_VECTOR).to_constant ();
8054 asm_fprintf (f, "[%s, #%wd, mul vl]",
8055 reg_names[REGNO (addr.base)], vnum);
8056 }
8057 else if (aarch64_sve_pred_mode_p (mode))
8058 {
8059 HOST_WIDE_INT vnum
8060 = exact_div (addr.const_offset,
8061 BYTES_PER_SVE_PRED).to_constant ();
8062 asm_fprintf (f, "[%s, #%wd, mul vl]",
8063 reg_names[REGNO (addr.base)], vnum);
8064 }
8065 else
8066 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8067 INTVAL (addr.offset));
8068 return true;
8069
8070 case ADDRESS_REG_REG:
8071 if (addr.shift == 0)
8072 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8073 reg_names [REGNO (addr.offset)]);
8074 else
8075 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8076 reg_names [REGNO (addr.offset)], addr.shift);
8077 return true;
8078
8079 case ADDRESS_REG_UXTW:
8080 if (addr.shift == 0)
8081 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8082 REGNO (addr.offset) - R0_REGNUM);
8083 else
8084 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8085 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8086 return true;
8087
8088 case ADDRESS_REG_SXTW:
8089 if (addr.shift == 0)
8090 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8091 REGNO (addr.offset) - R0_REGNUM);
8092 else
8093 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8094 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8095 return true;
8096
8097 case ADDRESS_REG_WB:
8098 /* Writeback is only supported for fixed-width modes. */
8099 size = GET_MODE_SIZE (mode).to_constant ();
8100 switch (GET_CODE (x))
8101 {
8102 case PRE_INC:
8103 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8104 return true;
8105 case POST_INC:
8106 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8107 return true;
8108 case PRE_DEC:
8109 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8110 return true;
8111 case POST_DEC:
8112 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8113 return true;
8114 case PRE_MODIFY:
8115 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8116 INTVAL (addr.offset));
8117 return true;
8118 case POST_MODIFY:
8119 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8120 INTVAL (addr.offset));
8121 return true;
8122 default:
8123 break;
8124 }
8125 break;
8126
8127 case ADDRESS_LO_SUM:
8128 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8129 output_addr_const (f, addr.offset);
8130 asm_fprintf (f, "]");
8131 return true;
8132
8133 case ADDRESS_SYMBOLIC:
8134 output_addr_const (f, x);
8135 return true;
8136 }
8137
8138 return false;
8139 }
8140
8141 /* Print address 'x' of a memory access with mode 'mode'. */
8142 static void
8143 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8144 {
8145 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8146 output_addr_const (f, x);
8147 }
8148
8149 bool
8150 aarch64_label_mentioned_p (rtx x)
8151 {
8152 const char *fmt;
8153 int i;
8154
8155 if (GET_CODE (x) == LABEL_REF)
8156 return true;
8157
8158 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8159 referencing instruction, but they are constant offsets, not
8160 symbols. */
8161 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8162 return false;
8163
8164 fmt = GET_RTX_FORMAT (GET_CODE (x));
8165 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8166 {
8167 if (fmt[i] == 'E')
8168 {
8169 int j;
8170
8171 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8172 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8173 return 1;
8174 }
8175 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8176 return 1;
8177 }
8178
8179 return 0;
8180 }
8181
8182 /* Implement REGNO_REG_CLASS. */
8183
8184 enum reg_class
8185 aarch64_regno_regclass (unsigned regno)
8186 {
8187 if (GP_REGNUM_P (regno))
8188 return GENERAL_REGS;
8189
8190 if (regno == SP_REGNUM)
8191 return STACK_REG;
8192
8193 if (regno == FRAME_POINTER_REGNUM
8194 || regno == ARG_POINTER_REGNUM)
8195 return POINTER_REGS;
8196
8197 if (FP_REGNUM_P (regno))
8198 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8199
8200 if (PR_REGNUM_P (regno))
8201 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8202
8203 return NO_REGS;
8204 }
8205
8206 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8207 If OFFSET is out of range, return an offset of an anchor point
8208 that is in range. Return 0 otherwise. */
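/* For example, an aligned SImode offset of 0x10080 yields the anchor
0x10000, leaving a residual of 0x80 that fits the scaled unsigned
12-bit range; an unaligned SImode offset of 0x103 yields 0x200,
leaving -0xfd, which fits the signed 9-bit range. */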
8209
8210 static HOST_WIDE_INT
8211 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8212 machine_mode mode)
8213 {
8214 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8215 if (size > 16)
8216 return (offset + 0x400) & ~0x7f0;
8217
8218 /* For offsets that aren't a multiple of the access size, the limit is
8219 -256...255. */
8220 if (offset & (size - 1))
8221 {
8222 /* BLKmode typically uses LDP of X-registers. */
8223 if (mode == BLKmode)
8224 return (offset + 512) & ~0x3ff;
8225 return (offset + 0x100) & ~0x1ff;
8226 }
8227
8228 /* Small negative offsets are supported. */
8229 if (IN_RANGE (offset, -256, 0))
8230 return 0;
8231
8232 if (mode == TImode || mode == TFmode)
8233 return (offset + 0x100) & ~0x1ff;
8234
8235 /* Otherwise use an unsigned 12-bit offset scaled by the access size. */
8236 return offset & (~0xfff * size);
8237 }
8238
8239 static rtx
8240 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8241 {
8242 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8243 where mask is selected by alignment and size of the offset.
8244 We try to pick as large a range for the offset as possible to
8245 maximize the chance of a CSE. However, for aligned addresses
8246 we limit the range to 4k so that structures with different sized
8247 elements are likely to use the same base. We need to be careful
8248 not to split a CONST for some forms of address expression, otherwise
8249 it will generate sub-optimal code. */
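/* For example, an SImode access at x0 + 0x13004 is best emitted as
add x1, x0, #0x10, lsl #12
ldr w2, [x1, #0x3004]
so that nearby accesses can reuse the anchor in x1 (the register
numbers here are purely illustrative). */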
8250
8251 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8252 {
8253 rtx base = XEXP (x, 0);
8254 rtx offset_rtx = XEXP (x, 1);
8255 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8256
8257 if (GET_CODE (base) == PLUS)
8258 {
8259 rtx op0 = XEXP (base, 0);
8260 rtx op1 = XEXP (base, 1);
8261
8262 /* Force any scaling into a temp for CSE. */
8263 op0 = force_reg (Pmode, op0);
8264 op1 = force_reg (Pmode, op1);
8265
8266 /* Let the pointer register be in op0. */
8267 if (REG_POINTER (op1))
8268 std::swap (op0, op1);
8269
8270 /* If the pointer is virtual or frame related, then we know that
8271 virtual register instantiation or register elimination is going
8272 to apply a second constant. We want the two constants folded
8273 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8274 if (virt_or_elim_regno_p (REGNO (op0)))
8275 {
8276 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8277 NULL_RTX, true, OPTAB_DIRECT);
8278 return gen_rtx_PLUS (Pmode, base, op1);
8279 }
8280
8281 /* Otherwise, in order to encourage CSE (and thence loop strength
8282 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8283 base = expand_binop (Pmode, add_optab, op0, op1,
8284 NULL_RTX, true, OPTAB_DIRECT);
8285 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8286 }
8287
8288 HOST_WIDE_INT size;
8289 if (GET_MODE_SIZE (mode).is_constant (&size))
8290 {
8291 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8292 mode);
8293 if (base_offset != 0)
8294 {
8295 base = plus_constant (Pmode, base, base_offset);
8296 base = force_operand (base, NULL_RTX);
8297 return plus_constant (Pmode, base, offset - base_offset);
8298 }
8299 }
8300 }
8301
8302 return x;
8303 }
8304
8305 static reg_class_t
8306 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8307 reg_class_t rclass,
8308 machine_mode mode,
8309 secondary_reload_info *sri)
8310 {
8311 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8312 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8313 comment at the head of aarch64-sve.md for more details about the
8314 big-endian handling. */
8315 if (BYTES_BIG_ENDIAN
8316 && reg_class_subset_p (rclass, FP_REGS)
8317 && !((REG_P (x) && HARD_REGISTER_P (x))
8318 || aarch64_simd_valid_immediate (x, NULL))
8319 && aarch64_sve_data_mode_p (mode))
8320 {
8321 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8322 return NO_REGS;
8323 }
8324
8325 /* If we have to disable direct literal pool loads and stores because the
8326 function is too big, then we need a scratch register. */
8327 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8328 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8329 || targetm.vector_mode_supported_p (GET_MODE (x)))
8330 && !aarch64_pcrelative_literal_loads)
8331 {
8332 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8333 return NO_REGS;
8334 }
8335
8336 /* Without the TARGET_SIMD instructions we cannot move a Q register
8337 to a Q register directly. We need a scratch. */
8338 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8339 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8340 && reg_class_subset_p (rclass, FP_REGS))
8341 {
8342 sri->icode = code_for_aarch64_reload_mov (mode);
8343 return NO_REGS;
8344 }
8345
8346 /* A TFmode or TImode memory access should be handled via an FP register
8347 because AArch64 has richer addressing modes for LDR/STR instructions
8348 than LDP/STP instructions. */
8349 if (TARGET_FLOAT && rclass == GENERAL_REGS
8350 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8351 return FP_REGS;
8352
8353 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8354 return GENERAL_REGS;
8355
8356 return NO_REGS;
8357 }
8358
8359 static bool
8360 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8361 {
8362 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8363
8364 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8365 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8366 if (frame_pointer_needed)
8367 return to == HARD_FRAME_POINTER_REGNUM;
8368 return true;
8369 }
8370
8371 poly_int64
8372 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8373 {
8374 if (to == HARD_FRAME_POINTER_REGNUM)
8375 {
8376 if (from == ARG_POINTER_REGNUM)
8377 return cfun->machine->frame.hard_fp_offset;
8378
8379 if (from == FRAME_POINTER_REGNUM)
8380 return cfun->machine->frame.hard_fp_offset
8381 - cfun->machine->frame.locals_offset;
8382 }
8383
8384 if (to == STACK_POINTER_REGNUM)
8385 {
8386 if (from == FRAME_POINTER_REGNUM)
8387 return cfun->machine->frame.frame_size
8388 - cfun->machine->frame.locals_offset;
8389 }
8390
8391 return cfun->machine->frame.frame_size;
8392 }
8393
8394 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8395 previous frame. */
8396
8397 rtx
8398 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8399 {
8400 if (count != 0)
8401 return const0_rtx;
8402 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8403 }
8404
8405
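/* Output the trampoline code template: two PC-relative loads that fetch
the target address and the static chain value from the data words
emitted after the code, followed by an indirect branch through IP1.
The data words are output as zeroes here and are filled in later by
aarch64_trampoline_init. */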
8406 static void
8407 aarch64_asm_trampoline_template (FILE *f)
8408 {
8409 int offset1 = 16;
8410 int offset2 = 20;
8411
8412 if (aarch64_bti_enabled ())
8413 {
8414 asm_fprintf (f, "\thint\t34 // bti c\n");
8415 offset1 -= 4;
8416 offset2 -= 4;
8417 }
8418
8419 if (TARGET_ILP32)
8420 {
8421 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8422 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8423 offset1);
8424 }
8425 else
8426 {
8427 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8428 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8429 offset2);
8430 }
8431 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8432
8433 /* The trampoline needs an extra padding instruction. If BTI is
8434 enabled, the padding instruction is replaced by the BTI instruction
8435 at the beginning. */
8436 if (!aarch64_bti_enabled ())
8437 assemble_aligned_integer (4, const0_rtx);
8438
8439 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8440 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8441 }
8442
8443 static void
8444 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8445 {
8446 rtx fnaddr, mem, a_tramp;
8447 const int tramp_code_sz = 16;
8448
8449 /* Don't need to copy the trailing D-words; we fill those in below. */
8450 emit_block_move (m_tramp, assemble_trampoline_template (),
8451 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8452 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8453 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8454 if (GET_MODE (fnaddr) != ptr_mode)
8455 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8456 emit_move_insn (mem, fnaddr);
8457
8458 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8459 emit_move_insn (mem, chain_value);
8460
8461 /* XXX We should really define a "clear_cache" pattern and use
8462 gen_clear_cache(). */
8463 a_tramp = XEXP (m_tramp, 0);
8464 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8465 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8466 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8467 ptr_mode);
8468 }
8469
8470 static unsigned char
8471 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8472 {
8473 /* ??? Logically we should only need to provide a value when
8474 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8475 can hold MODE, but at the moment we need to handle all modes.
8476 Just ignore any runtime parts for registers that can't store them. */
8477 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8478 unsigned int nregs;
8479 switch (regclass)
8480 {
8481 case TAILCALL_ADDR_REGS:
8482 case POINTER_REGS:
8483 case GENERAL_REGS:
8484 case ALL_REGS:
8485 case POINTER_AND_FP_REGS:
8486 case FP_REGS:
8487 case FP_LO_REGS:
8488 if (aarch64_sve_data_mode_p (mode)
8489 && constant_multiple_p (GET_MODE_SIZE (mode),
8490 BYTES_PER_SVE_VECTOR, &nregs))
8491 return nregs;
8492 return (aarch64_vector_data_mode_p (mode)
8493 ? CEIL (lowest_size, UNITS_PER_VREG)
8494 : CEIL (lowest_size, UNITS_PER_WORD));
8495 case STACK_REG:
8496 case PR_REGS:
8497 case PR_LO_REGS:
8498 case PR_HI_REGS:
8499 return 1;
8500
8501 case NO_REGS:
8502 return 0;
8503
8504 default:
8505 break;
8506 }
8507 gcc_unreachable ();
8508 }
8509
8510 static reg_class_t
8511 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8512 {
8513 if (regclass == POINTER_REGS)
8514 return GENERAL_REGS;
8515
8516 if (regclass == STACK_REG)
8517 {
8518 if (REG_P(x)
8519 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8520 return regclass;
8521
8522 return NO_REGS;
8523 }
8524
8525 /* Register elimination can result in a request for
8526 SP+constant->FP_REGS. We cannot support such operations, which
8527 use SP as source and an FP_REG as destination, so reject the
8528 request outright. */
8529 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8530 {
8531 rtx lhs = XEXP (x, 0);
8532
8533 /* Look through a possible SUBREG introduced by ILP32. */
8534 if (GET_CODE (lhs) == SUBREG)
8535 lhs = SUBREG_REG (lhs);
8536
8537 gcc_assert (REG_P (lhs));
8538 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8539 POINTER_REGS));
8540 return NO_REGS;
8541 }
8542
8543 return regclass;
8544 }
8545
8546 void
8547 aarch64_asm_output_labelref (FILE* f, const char *name)
8548 {
8549 asm_fprintf (f, "%U%s", name);
8550 }
8551
8552 static void
8553 aarch64_elf_asm_constructor (rtx symbol, int priority)
8554 {
8555 if (priority == DEFAULT_INIT_PRIORITY)
8556 default_ctor_section_asm_out_constructor (symbol, priority);
8557 else
8558 {
8559 section *s;
8560 /* Priority is known to be in the range [0, 65535], so 18 bytes
8561 would be enough, but the compiler might not know that. To avoid
8562 a -Wformat-truncation false positive, use a larger size. */
8563 char buf[23];
8564 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8565 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8566 switch_to_section (s);
8567 assemble_align (POINTER_SIZE);
8568 assemble_aligned_integer (POINTER_BYTES, symbol);
8569 }
8570 }
8571
8572 static void
8573 aarch64_elf_asm_destructor (rtx symbol, int priority)
8574 {
8575 if (priority == DEFAULT_INIT_PRIORITY)
8576 default_dtor_section_asm_out_destructor (symbol, priority);
8577 else
8578 {
8579 section *s;
8580 /* Priority is known to be in the range [0, 65535], so 18 bytes
8581 would be enough, but the compiler might not know that. To avoid
8582 a -Wformat-truncation false positive, use a larger size. */
8583 char buf[23];
8584 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8585 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8586 switch_to_section (s);
8587 assemble_align (POINTER_SIZE);
8588 assemble_aligned_integer (POINTER_BYTES, symbol);
8589 }
8590 }
8591
8592 const char*
8593 aarch64_output_casesi (rtx *operands)
8594 {
8595 char buf[100];
8596 char label[100];
8597 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8598 int index;
8599 static const char *const patterns[4][2] =
8600 {
8601 {
8602 "ldrb\t%w3, [%0,%w1,uxtw]",
8603 "add\t%3, %4, %w3, sxtb #2"
8604 },
8605 {
8606 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8607 "add\t%3, %4, %w3, sxth #2"
8608 },
8609 {
8610 "ldr\t%w3, [%0,%w1,uxtw #2]",
8611 "add\t%3, %4, %w3, sxtw #2"
8612 },
8613 /* We assume that DImode is only generated when not optimizing and
8614 that we don't really need 64-bit address offsets. That would
8615 imply an object file with 8GB of code in a single function! */
8616 {
8617 "ldr\t%w3, [%0,%w1,uxtw #2]",
8618 "add\t%3, %4, %w3, sxtw #2"
8619 }
8620 };
8621
8622 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8623
8624 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8625 index = exact_log2 (GET_MODE_SIZE (mode));
8626
8627 gcc_assert (index >= 0 && index <= 3);
8628
8629 /* Need to implement table size reduction, by changing the code below. */
8630 output_asm_insn (patterns[index][0], operands);
8631 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8632 snprintf (buf, sizeof (buf),
8633 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8634 output_asm_insn (buf, operands);
8635 output_asm_insn (patterns[index][1], operands);
8636 output_asm_insn ("br\t%3", operands);
8637 assemble_label (asm_out_file, label);
8638 return "";
8639 }
8640
8641
8642 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8643 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8644 operator. */
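/* For example, a MASK of 0x3fc with SHIFT == 2 returns 8 (a UXTB
extend), and a MASK of 0xffff with SHIFT == 0 returns 16 (UXTH);
combinations that match no extend return 0. */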
8645
8646 int
8647 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8648 {
8649 if (shift >= 0 && shift <= 3)
8650 {
8651 int size;
8652 for (size = 8; size <= 32; size *= 2)
8653 {
8654 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8655 if (mask == bits << shift)
8656 return size;
8657 }
8658 }
8659 return 0;
8660 }
8661
8662 /* Constant pools are per-function only when PC-relative
8663 literal loads are enabled or we are in the large memory
8664 model. */
8665
8666 static inline bool
8667 aarch64_can_use_per_function_literal_pools_p (void)
8668 {
8669 return (aarch64_pcrelative_literal_loads
8670 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8671 }
8672
8673 static bool
8674 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8675 {
8676 /* We can't use blocks for constants when we're using a per-function
8677 constant pool. */
8678 return !aarch64_can_use_per_function_literal_pools_p ();
8679 }
8680
8681 /* Select appropriate section for constants depending
8682 on where we place literal pools. */
8683
8684 static section *
8685 aarch64_select_rtx_section (machine_mode mode,
8686 rtx x,
8687 unsigned HOST_WIDE_INT align)
8688 {
8689 if (aarch64_can_use_per_function_literal_pools_p ())
8690 return function_section (current_function_decl);
8691
8692 return default_elf_select_rtx_section (mode, x, align);
8693 }
8694
8695 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8696 void
8697 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8698 HOST_WIDE_INT offset)
8699 {
8700 /* When using per-function literal pools, we must ensure that any code
8701 section is aligned to the minimal instruction length, lest we get
8702 errors from the assembler re "unaligned instructions". */
8703 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8704 ASM_OUTPUT_ALIGN (f, 2);
8705 }
8706
8707 /* Costs. */
8708
8709 /* Helper function for rtx cost calculation. Strip a shift expression
8710 from X. Returns the inner operand if successful, or the original
8711 expression on failure. */
8712 static rtx
8713 aarch64_strip_shift (rtx x)
8714 {
8715 rtx op = x;
8716
8717 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8718 we can convert both to ROR during final output. */
8719 if ((GET_CODE (op) == ASHIFT
8720 || GET_CODE (op) == ASHIFTRT
8721 || GET_CODE (op) == LSHIFTRT
8722 || GET_CODE (op) == ROTATERT
8723 || GET_CODE (op) == ROTATE)
8724 && CONST_INT_P (XEXP (op, 1)))
8725 return XEXP (op, 0);
8726
8727 if (GET_CODE (op) == MULT
8728 && CONST_INT_P (XEXP (op, 1))
8729 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8730 return XEXP (op, 0);
8731
8732 return x;
8733 }
8734
8735 /* Helper function for rtx cost calculation. Strip an extend
8736 expression from X. Returns the inner operand if successful, or the
8737 original expression on failure. We deal with a number of possible
8738 canonicalization variations here. If STRIP_SHIFT is true, then
8739 we can strip off a shift also. */
8740 static rtx
8741 aarch64_strip_extend (rtx x, bool strip_shift)
8742 {
8743 scalar_int_mode mode;
8744 rtx op = x;
8745
8746 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8747 return op;
8748
8749 /* Zero and sign extraction of a widened value. */
8750 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8751 && XEXP (op, 2) == const0_rtx
8752 && GET_CODE (XEXP (op, 0)) == MULT
8753 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8754 XEXP (op, 1)))
8755 return XEXP (XEXP (op, 0), 0);
8756
8757 /* It can also be represented (for zero-extend) as an AND with an
8758 immediate. */
8759 if (GET_CODE (op) == AND
8760 && GET_CODE (XEXP (op, 0)) == MULT
8761 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8762 && CONST_INT_P (XEXP (op, 1))
8763 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8764 INTVAL (XEXP (op, 1))) != 0)
8765 return XEXP (XEXP (op, 0), 0);
8766
8767 /* Now handle extended register, as this may also have an optional
8768 left shift by 1..4. */
8769 if (strip_shift
8770 && GET_CODE (op) == ASHIFT
8771 && CONST_INT_P (XEXP (op, 1))
8772 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8773 op = XEXP (op, 0);
8774
8775 if (GET_CODE (op) == ZERO_EXTEND
8776 || GET_CODE (op) == SIGN_EXTEND)
8777 op = XEXP (op, 0);
8778
8779 if (op != x)
8780 return op;
8781
8782 return x;
8783 }
8784
8785 /* Return true iff CODE is a shift supported in combination
8786 with arithmetic instructions. */
8787
8788 static bool
8789 aarch64_shift_p (enum rtx_code code)
8790 {
8791 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8792 }
8793
8794
8795 /* Return true iff X is a cheap shift without a sign extend. */
8796
8797 static bool
8798 aarch64_cheap_mult_shift_p (rtx x)
8799 {
8800 rtx op0, op1;
8801
8802 op0 = XEXP (x, 0);
8803 op1 = XEXP (x, 1);
8804
8805 if (!(aarch64_tune_params.extra_tuning_flags
8806 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8807 return false;
8808
8809 if (GET_CODE (op0) == SIGN_EXTEND)
8810 return false;
8811
8812 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8813 && UINTVAL (op1) <= 4)
8814 return true;
8815
8816 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8817 return false;
8818
8819 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8820
8821 if (l2 > 0 && l2 <= 4)
8822 return true;
8823
8824 return false;
8825 }
8826
8827 /* Helper function for rtx cost calculation. Calculate the cost of
8828 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8829 Return the calculated cost of the expression, recursing manually into
8830 operands where needed. */
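/* For example, (mult (reg X) (const_int 8)) appearing under a PLUS is
   treated as an ADD with a shift-by-immediate operand (alu.arith_shift
   when optimizing for speed) rather than as a true multiply, unless the
   tuning marks such shifts as free.  */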
8831
8832 static int
8833 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8834 {
8835 rtx op0, op1;
8836 const struct cpu_cost_table *extra_cost
8837 = aarch64_tune_params.insn_extra_cost;
8838 int cost = 0;
8839 bool compound_p = (outer == PLUS || outer == MINUS);
8840 machine_mode mode = GET_MODE (x);
8841
8842 gcc_checking_assert (code == MULT);
8843
8844 op0 = XEXP (x, 0);
8845 op1 = XEXP (x, 1);
8846
8847 if (VECTOR_MODE_P (mode))
8848 mode = GET_MODE_INNER (mode);
8849
8850 /* Integer multiply/fma. */
8851 if (GET_MODE_CLASS (mode) == MODE_INT)
8852 {
8853 /* The multiply will be canonicalized as a shift, so cost it as such. */
8854 if (aarch64_shift_p (GET_CODE (x))
8855 || (CONST_INT_P (op1)
8856 && exact_log2 (INTVAL (op1)) > 0))
8857 {
8858 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8859 || GET_CODE (op0) == SIGN_EXTEND;
8860 if (speed)
8861 {
8862 if (compound_p)
8863 {
8864 /* If the shift is considered cheap,
8865 then don't add any cost. */
8866 if (aarch64_cheap_mult_shift_p (x))
8867 ;
8868 else if (REG_P (op1))
8869 /* ARITH + shift-by-register. */
8870 cost += extra_cost->alu.arith_shift_reg;
8871 else if (is_extend)
8872 /* ARITH + extended register. We don't have a cost field
8873 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8874 cost += extra_cost->alu.extend_arith;
8875 else
8876 /* ARITH + shift-by-immediate. */
8877 cost += extra_cost->alu.arith_shift;
8878 }
8879 else
8880 /* LSL (immediate). */
8881 cost += extra_cost->alu.shift;
8882
8883 }
8884 /* Strip extends as we will have costed them in the case above. */
8885 if (is_extend)
8886 op0 = aarch64_strip_extend (op0, true);
8887
8888 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8889
8890 return cost;
8891 }
8892
8893 /* MNEG or [US]MNEGL. Extract the NEG operand, indicate that it's a
8894 compound operation, and let the cases below handle it. After all,
8895 MNEG is a special-case alias of MSUB. */
8896 if (GET_CODE (op0) == NEG)
8897 {
8898 op0 = XEXP (op0, 0);
8899 compound_p = true;
8900 }
8901
8902 /* Integer multiplies or FMAs have zero/sign extending variants. */
8903 if ((GET_CODE (op0) == ZERO_EXTEND
8904 && GET_CODE (op1) == ZERO_EXTEND)
8905 || (GET_CODE (op0) == SIGN_EXTEND
8906 && GET_CODE (op1) == SIGN_EXTEND))
8907 {
8908 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8909 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8910
8911 if (speed)
8912 {
8913 if (compound_p)
8914 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8915 cost += extra_cost->mult[0].extend_add;
8916 else
8917 /* MUL/SMULL/UMULL. */
8918 cost += extra_cost->mult[0].extend;
8919 }
8920
8921 return cost;
8922 }
8923
8924 /* This is either an integer multiply or a MADD. In both cases
8925 we want to recurse and cost the operands. */
8926 cost += rtx_cost (op0, mode, MULT, 0, speed);
8927 cost += rtx_cost (op1, mode, MULT, 1, speed);
8928
8929 if (speed)
8930 {
8931 if (compound_p)
8932 /* MADD/MSUB. */
8933 cost += extra_cost->mult[mode == DImode].add;
8934 else
8935 /* MUL. */
8936 cost += extra_cost->mult[mode == DImode].simple;
8937 }
8938
8939 return cost;
8940 }
8941 else
8942 {
8943 if (speed)
8944 {
8945 /* Floating-point FMA/FMUL can also support negations of the
8946 operands, unless the rounding mode is upward or downward, in
8947 which case FNMUL differs from FMUL with operand negation. */
8948 bool neg0 = GET_CODE (op0) == NEG;
8949 bool neg1 = GET_CODE (op1) == NEG;
8950 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8951 {
8952 if (neg0)
8953 op0 = XEXP (op0, 0);
8954 if (neg1)
8955 op1 = XEXP (op1, 0);
8956 }
8957
8958 if (compound_p)
8959 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8960 cost += extra_cost->fp[mode == DFmode].fma;
8961 else
8962 /* FMUL/FNMUL. */
8963 cost += extra_cost->fp[mode == DFmode].mult;
8964 }
8965
8966 cost += rtx_cost (op0, mode, MULT, 0, speed);
8967 cost += rtx_cost (op1, mode, MULT, 1, speed);
8968 return cost;
8969 }
8970 }
8971
8972 static int
8973 aarch64_address_cost (rtx x,
8974 machine_mode mode,
8975 addr_space_t as ATTRIBUTE_UNUSED,
8976 bool speed)
8977 {
8978 enum rtx_code c = GET_CODE (x);
8979 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8980 struct aarch64_address_info info;
8981 int cost = 0;
8982 info.shift = 0;
8983
8984 if (!aarch64_classify_address (&info, x, mode, false))
8985 {
8986 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8987 {
8988 /* This is a CONST or SYMBOL ref which will be split
8989 in a different way depending on the code model in use.
8990 Cost it through the generic infrastructure. */
8991 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8992 /* Divide through by the cost of one instruction to
8993 bring it to the same units as the address costs. */
8994 cost_symbol_ref /= COSTS_N_INSNS (1);
8995 /* The cost is then the cost of preparing the address,
8996 followed by an immediate (possibly 0) offset. */
8997 return cost_symbol_ref + addr_cost->imm_offset;
8998 }
8999 else
9000 {
9001 /* This is most likely a jump table from a case
9002 statement. */
9003 return addr_cost->register_offset;
9004 }
9005 }
9006
9007 switch (info.type)
9008 {
9009 case ADDRESS_LO_SUM:
9010 case ADDRESS_SYMBOLIC:
9011 case ADDRESS_REG_IMM:
9012 cost += addr_cost->imm_offset;
9013 break;
9014
9015 case ADDRESS_REG_WB:
9016 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9017 cost += addr_cost->pre_modify;
9018 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9019 cost += addr_cost->post_modify;
9020 else
9021 gcc_unreachable ();
9022
9023 break;
9024
9025 case ADDRESS_REG_REG:
9026 cost += addr_cost->register_offset;
9027 break;
9028
9029 case ADDRESS_REG_SXTW:
9030 cost += addr_cost->register_sextend;
9031 break;
9032
9033 case ADDRESS_REG_UXTW:
9034 cost += addr_cost->register_zextend;
9035 break;
9036
9037 default:
9038 gcc_unreachable ();
9039 }
9040
9041
9042 if (info.shift > 0)
9043 {
9044 /* For the sake of calculating the cost of the shifted register
9045 component, we can treat same sized modes in the same way. */
9046 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9047 cost += addr_cost->addr_scale_costs.hi;
9048 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9049 cost += addr_cost->addr_scale_costs.si;
9050 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9051 cost += addr_cost->addr_scale_costs.di;
9052 else
9053 /* We can't tell, or this is a 128-bit vector. */
9054 cost += addr_cost->addr_scale_costs.ti;
9055 }
9056
9057 return cost;
9058 }
9059
9060 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9061 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9062 to be taken. */
9063
9064 int
9065 aarch64_branch_cost (bool speed_p, bool predictable_p)
9066 {
9067 /* When optimizing for speed, use the cost of unpredictable branches. */
9068 const struct cpu_branch_cost *branch_costs =
9069 aarch64_tune_params.branch_costs;
9070
9071 if (!speed_p || predictable_p)
9072 return branch_costs->predictable;
9073 else
9074 return branch_costs->unpredictable;
9075 }
9076
9077 /* Return true if the RTX X in mode MODE is a zero or sign extract
9078 usable in an ADD or SUB (extended register) instruction. */
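/* For example, (zero_extend:DI (reg:SI X)) matches the plain
   extended-register form (e.g. ADD Xd, Xn, Wm, uxtw), while the
   [SIGN|ZERO]_EXTRACT form covers the extract-based representation of an
   extend combined with a left shift by a small constant.  */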
9079 static bool
9080 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9081 {
9082 /* Catch add with a sign extract.
9083 This is add_<optab><mode>_multp2. */
9084 if (GET_CODE (x) == SIGN_EXTRACT
9085 || GET_CODE (x) == ZERO_EXTRACT)
9086 {
9087 rtx op0 = XEXP (x, 0);
9088 rtx op1 = XEXP (x, 1);
9089 rtx op2 = XEXP (x, 2);
9090
9091 if (GET_CODE (op0) == MULT
9092 && CONST_INT_P (op1)
9093 && op2 == const0_rtx
9094 && CONST_INT_P (XEXP (op0, 1))
9095 && aarch64_is_extend_from_extract (mode,
9096 XEXP (op0, 1),
9097 op1))
9098 {
9099 return true;
9100 }
9101 }
9102 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9103 No shift. */
9104 else if (GET_CODE (x) == SIGN_EXTEND
9105 || GET_CODE (x) == ZERO_EXTEND)
9106 return REG_P (XEXP (x, 0));
9107
9108 return false;
9109 }
9110
9111 static bool
9112 aarch64_frint_unspec_p (unsigned int u)
9113 {
9114 switch (u)
9115 {
9116 case UNSPEC_FRINTZ:
9117 case UNSPEC_FRINTP:
9118 case UNSPEC_FRINTM:
9119 case UNSPEC_FRINTA:
9120 case UNSPEC_FRINTN:
9121 case UNSPEC_FRINTX:
9122 case UNSPEC_FRINTI:
9123 return true;
9124
9125 default:
9126 return false;
9127 }
9128 }
9129
9130 /* Return true iff X is an rtx that will match an extr instruction
9131 i.e. as described in the *extr<mode>5_insn family of patterns.
9132 OP0 and OP1 will be set to the operands of the shifts involved
9133 on success and will be NULL_RTX otherwise. */
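/* For example, in DImode
     (ior (ashift (reg X) (const_int 16)) (lshiftrt (reg Y) (const_int 48)))
   matches because the two shift amounts sum to 64; *RES_OP0 is set to X
   and *RES_OP1 to Y.  */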
9134
9135 static bool
9136 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9137 {
9138 rtx op0, op1;
9139 scalar_int_mode mode;
9140 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9141 return false;
9142
9143 *res_op0 = NULL_RTX;
9144 *res_op1 = NULL_RTX;
9145
9146 if (GET_CODE (x) != IOR)
9147 return false;
9148
9149 op0 = XEXP (x, 0);
9150 op1 = XEXP (x, 1);
9151
9152 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9153 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9154 {
9155 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9156 if (GET_CODE (op1) == ASHIFT)
9157 std::swap (op0, op1);
9158
9159 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9160 return false;
9161
9162 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9163 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9164
9165 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9166 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9167 {
9168 *res_op0 = XEXP (op0, 0);
9169 *res_op1 = XEXP (op1, 0);
9170 return true;
9171 }
9172 }
9173
9174 return false;
9175 }
9176
9177 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9178 storing it in *COST. Result is true if the total cost of the operation
9179 has now been calculated. */
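/* For example, (if_then_else (ne (reg X) (const_int 0)) (label_ref L) (pc))
   is a CBNZ-style branch, so only the cost of X needs to be added, whereas
   an IF_THEN_ELSE whose condition operates on the CC register and whose
   arms are register values is costed as one of the CSEL variants.  */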
9180 static bool
9181 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9182 {
9183 rtx inner;
9184 rtx comparator;
9185 enum rtx_code cmpcode;
9186
9187 if (COMPARISON_P (op0))
9188 {
9189 inner = XEXP (op0, 0);
9190 comparator = XEXP (op0, 1);
9191 cmpcode = GET_CODE (op0);
9192 }
9193 else
9194 {
9195 inner = op0;
9196 comparator = const0_rtx;
9197 cmpcode = NE;
9198 }
9199
9200 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9201 {
9202 /* Conditional branch. */
9203 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9204 return true;
9205 else
9206 {
9207 if (cmpcode == NE || cmpcode == EQ)
9208 {
9209 if (comparator == const0_rtx)
9210 {
9211 /* TBZ/TBNZ/CBZ/CBNZ. */
9212 if (GET_CODE (inner) == ZERO_EXTRACT)
9213 /* TBZ/TBNZ. */
9214 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9215 ZERO_EXTRACT, 0, speed);
9216 else
9217 /* CBZ/CBNZ. */
9218 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9219
9220 return true;
9221 }
9222 }
9223 else if (cmpcode == LT || cmpcode == GE)
9224 {
9225 /* TBZ/TBNZ. */
9226 if (comparator == const0_rtx)
9227 return true;
9228 }
9229 }
9230 }
9231 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9232 {
9233 /* CCMP. */
9234 if (GET_CODE (op1) == COMPARE)
9235 {
9236 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9237 if (XEXP (op1, 1) == const0_rtx)
9238 *cost += 1;
9239 if (speed)
9240 {
9241 machine_mode mode = GET_MODE (XEXP (op1, 0));
9242 const struct cpu_cost_table *extra_cost
9243 = aarch64_tune_params.insn_extra_cost;
9244
9245 if (GET_MODE_CLASS (mode) == MODE_INT)
9246 *cost += extra_cost->alu.arith;
9247 else
9248 *cost += extra_cost->fp[mode == DFmode].compare;
9249 }
9250 return true;
9251 }
9252
9253 /* It's a conditional operation based on the status flags,
9254 so it must be some flavor of CSEL. */
9255
9256 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9257 if (GET_CODE (op1) == NEG
9258 || GET_CODE (op1) == NOT
9259 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9260 op1 = XEXP (op1, 0);
9261 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9262 {
9263 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9264 op1 = XEXP (op1, 0);
9265 op2 = XEXP (op2, 0);
9266 }
9267
9268 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9269 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9270 return true;
9271 }
9272
9273 /* We don't know what this is, so cost all operands. */
9274 return false;
9275 }
9276
9277 /* Check whether X is a bitfield operation of the form shift + extend that
9278 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9279 operand to which the bitfield operation is applied. Otherwise return
9280 NULL_RTX. */
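/* For example, (zero_extend:SI (lshiftrt:QI (reg X) (const_int 2)))
   corresponds to a UBFX extracting bits 2..7 of X, so X is returned.  */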
9281
9282 static rtx
9283 aarch64_extend_bitfield_pattern_p (rtx x)
9284 {
9285 rtx_code outer_code = GET_CODE (x);
9286 machine_mode outer_mode = GET_MODE (x);
9287
9288 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9289 && outer_mode != SImode && outer_mode != DImode)
9290 return NULL_RTX;
9291
9292 rtx inner = XEXP (x, 0);
9293 rtx_code inner_code = GET_CODE (inner);
9294 machine_mode inner_mode = GET_MODE (inner);
9295 rtx op = NULL_RTX;
9296
9297 switch (inner_code)
9298 {
9299 case ASHIFT:
9300 if (CONST_INT_P (XEXP (inner, 1))
9301 && (inner_mode == QImode || inner_mode == HImode))
9302 op = XEXP (inner, 0);
9303 break;
9304 case LSHIFTRT:
9305 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9306 && (inner_mode == QImode || inner_mode == HImode))
9307 op = XEXP (inner, 0);
9308 break;
9309 case ASHIFTRT:
9310 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9311 && (inner_mode == QImode || inner_mode == HImode))
9312 op = XEXP (inner, 0);
9313 break;
9314 default:
9315 break;
9316 }
9317
9318 return op;
9319 }
9320
9321 /* Return true if the mask and a shift amount from an RTX of the form
9322 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9323 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
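/* For example, with MODE == SImode, MASK == 0x1f8 and SHFT_AMNT == 3 the
   predicate holds: 0x1f8 >> 3 is 0x3f, 0x3f + 1 is a power of two, and no
   set bits of the mask lie below the shift amount.  */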
9324
9325 bool
9326 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9327 rtx shft_amnt)
9328 {
9329 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9330 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9331 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9332 && (INTVAL (mask)
9333 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9334 }
9335
9336 /* Calculate the cost of calculating X, storing it in *COST. Result
9337 is true if the total cost of the operation has now been calculated. */
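/* (Returning false tells the generic rtx_cost machinery to recurse into
   the operands and add their costs to *COST on top of whatever has been
   accumulated here.)  */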
9338 static bool
9339 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9340 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9341 {
9342 rtx op0, op1, op2;
9343 const struct cpu_cost_table *extra_cost
9344 = aarch64_tune_params.insn_extra_cost;
9345 int code = GET_CODE (x);
9346 scalar_int_mode int_mode;
9347
9348 /* By default, assume that everything has equivalent cost to the
9349 cheapest instruction. Any additional costs are applied as a delta
9350 above this default. */
9351 *cost = COSTS_N_INSNS (1);
9352
9353 switch (code)
9354 {
9355 case SET:
9356 /* The cost depends entirely on the operands to SET. */
9357 *cost = 0;
9358 op0 = SET_DEST (x);
9359 op1 = SET_SRC (x);
9360
9361 switch (GET_CODE (op0))
9362 {
9363 case MEM:
9364 if (speed)
9365 {
9366 rtx address = XEXP (op0, 0);
9367 if (VECTOR_MODE_P (mode))
9368 *cost += extra_cost->ldst.storev;
9369 else if (GET_MODE_CLASS (mode) == MODE_INT)
9370 *cost += extra_cost->ldst.store;
9371 else if (mode == SFmode)
9372 *cost += extra_cost->ldst.storef;
9373 else if (mode == DFmode)
9374 *cost += extra_cost->ldst.stored;
9375
9376 *cost +=
9377 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9378 0, speed));
9379 }
9380
9381 *cost += rtx_cost (op1, mode, SET, 1, speed);
9382 return true;
9383
9384 case SUBREG:
9385 if (! REG_P (SUBREG_REG (op0)))
9386 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9387
9388 /* Fall through. */
9389 case REG:
9390 /* The cost is one per vector-register copied. */
9391 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9392 {
9393 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9394 *cost = COSTS_N_INSNS (nregs);
9395 }
9396 /* const0_rtx is in general free, but we will use an
9397 instruction to set a register to 0. */
9398 else if (REG_P (op1) || op1 == const0_rtx)
9399 {
9400 /* The cost is 1 per register copied. */
9401 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9402 *cost = COSTS_N_INSNS (nregs);
9403 }
9404 else
9405 /* Cost is just the cost of the RHS of the set. */
9406 *cost += rtx_cost (op1, mode, SET, 1, speed);
9407 return true;
9408
9409 case ZERO_EXTRACT:
9410 case SIGN_EXTRACT:
9411 /* Bit-field insertion. Strip any redundant widening of
9412 the RHS to meet the width of the target. */
9413 if (GET_CODE (op1) == SUBREG)
9414 op1 = SUBREG_REG (op1);
9415 if ((GET_CODE (op1) == ZERO_EXTEND
9416 || GET_CODE (op1) == SIGN_EXTEND)
9417 && CONST_INT_P (XEXP (op0, 1))
9418 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9419 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9420 op1 = XEXP (op1, 0);
9421
9422 if (CONST_INT_P (op1))
9423 {
9424 /* MOV immediate is assumed to always be cheap. */
9425 *cost = COSTS_N_INSNS (1);
9426 }
9427 else
9428 {
9429 /* BFM. */
9430 if (speed)
9431 *cost += extra_cost->alu.bfi;
9432 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9433 }
9434
9435 return true;
9436
9437 default:
9438 /* We can't make sense of this, assume default cost. */
9439 *cost = COSTS_N_INSNS (1);
9440 return false;
9441 }
9442 return false;
9443
9444 case CONST_INT:
9445 /* If an instruction can incorporate a constant within the
9446 instruction, the instruction's expression avoids calling
9447 rtx_cost() on the constant. If rtx_cost() is called on a
9448 constant, then it is usually because the constant must be
9449 moved into a register by one or more instructions.
9450
9451 The exception is constant 0, which can be expressed
9452 as XZR/WZR and is therefore free. The exception to this is
9453 if we have (set (reg) (const0_rtx)) in which case we must cost
9454 the move. However, we can catch that when we cost the SET, so
9455 we don't need to consider that here. */
9456 if (x == const0_rtx)
9457 *cost = 0;
9458 else
9459 {
9460 /* To an approximation, the cost of building any other constant
9461 is proportional to the number of instructions required to
9462 build that constant. This is true whether we
9463 are compiling for SPEED or otherwise. */
9464 if (!is_a <scalar_int_mode> (mode, &int_mode))
9465 int_mode = word_mode;
9466 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9467 (NULL_RTX, x, false, int_mode));
9468 }
9469 return true;
9470
9471 case CONST_DOUBLE:
9472
9473 /* First determine number of instructions to do the move
9474 as an integer constant. */
9475 if (!aarch64_float_const_representable_p (x)
9476 && !aarch64_can_const_movi_rtx_p (x, mode)
9477 && aarch64_float_const_rtx_p (x))
9478 {
9479 unsigned HOST_WIDE_INT ival;
9480 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9481 gcc_assert (succeed);
9482
9483 scalar_int_mode imode = (mode == HFmode
9484 ? SImode
9485 : int_mode_for_mode (mode).require ());
9486 int ncost = aarch64_internal_mov_immediate
9487 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9488 *cost += COSTS_N_INSNS (ncost);
9489 return true;
9490 }
9491
9492 if (speed)
9493 {
9494 /* mov[df,sf]_aarch64. */
9495 if (aarch64_float_const_representable_p (x))
9496 /* FMOV (scalar immediate). */
9497 *cost += extra_cost->fp[mode == DFmode].fpconst;
9498 else if (!aarch64_float_const_zero_rtx_p (x))
9499 {
9500 /* This will be a load from memory. */
9501 if (mode == DFmode)
9502 *cost += extra_cost->ldst.loadd;
9503 else
9504 *cost += extra_cost->ldst.loadf;
9505 }
9506 else
9507 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9508 or MOV v0.s[0], wzr - neither of which is modeled by the
9509 cost tables. Just use the default cost. */
9510 {
9511 }
9512 }
9513
9514 return true;
9515
9516 case MEM:
9517 if (speed)
9518 {
9519 /* For loads we want the base cost of a load, plus an
9520 approximation for the additional cost of the addressing
9521 mode. */
9522 rtx address = XEXP (x, 0);
9523 if (VECTOR_MODE_P (mode))
9524 *cost += extra_cost->ldst.loadv;
9525 else if (GET_MODE_CLASS (mode) == MODE_INT)
9526 *cost += extra_cost->ldst.load;
9527 else if (mode == SFmode)
9528 *cost += extra_cost->ldst.loadf;
9529 else if (mode == DFmode)
9530 *cost += extra_cost->ldst.loadd;
9531
9532 *cost +=
9533 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9534 0, speed));
9535 }
9536
9537 return true;
9538
9539 case NEG:
9540 op0 = XEXP (x, 0);
9541
9542 if (VECTOR_MODE_P (mode))
9543 {
9544 if (speed)
9545 {
9546 /* FNEG. */
9547 *cost += extra_cost->vect.alu;
9548 }
9549 return false;
9550 }
9551
9552 if (GET_MODE_CLASS (mode) == MODE_INT)
9553 {
9554 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9555 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9556 {
9557 /* CSETM. */
9558 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9559 return true;
9560 }
9561
9562 /* Cost this as SUB wzr, X. */
9563 op0 = CONST0_RTX (mode);
9564 op1 = XEXP (x, 0);
9565 goto cost_minus;
9566 }
9567
9568 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9569 {
9570 /* Support (neg (fma ...)) as a single instruction only if the
9571 sign of zeros is unimportant. This matches the decision-making
9572 in aarch64.md. */
9573 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9574 {
9575 /* FNMADD. */
9576 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9577 return true;
9578 }
9579 if (GET_CODE (op0) == MULT)
9580 {
9581 /* FNMUL. */
9582 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9583 return true;
9584 }
9585 if (speed)
9586 /* FNEG. */
9587 *cost += extra_cost->fp[mode == DFmode].neg;
9588 return false;
9589 }
9590
9591 return false;
9592
9593 case CLRSB:
9594 case CLZ:
9595 if (speed)
9596 {
9597 if (VECTOR_MODE_P (mode))
9598 *cost += extra_cost->vect.alu;
9599 else
9600 *cost += extra_cost->alu.clz;
9601 }
9602
9603 return false;
9604
9605 case COMPARE:
9606 op0 = XEXP (x, 0);
9607 op1 = XEXP (x, 1);
9608
9609 if (op1 == const0_rtx
9610 && GET_CODE (op0) == AND)
9611 {
9612 x = op0;
9613 mode = GET_MODE (op0);
9614 goto cost_logic;
9615 }
9616
9617 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9618 {
9619 /* TODO: A write to the CC flags possibly costs extra; this
9620 needs encoding in the cost tables. */
9621
9622 mode = GET_MODE (op0);
9623 /* ANDS. */
9624 if (GET_CODE (op0) == AND)
9625 {
9626 x = op0;
9627 goto cost_logic;
9628 }
9629
9630 if (GET_CODE (op0) == PLUS)
9631 {
9632 /* ADDS (and CMN alias). */
9633 x = op0;
9634 goto cost_plus;
9635 }
9636
9637 if (GET_CODE (op0) == MINUS)
9638 {
9639 /* SUBS. */
9640 x = op0;
9641 goto cost_minus;
9642 }
9643
9644 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9645 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9646 && CONST_INT_P (XEXP (op0, 2)))
9647 {
9648 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9649 Handle it here directly rather than going to cost_logic
9650 since we know the immediate generated for the TST is valid
9651 so we can avoid creating an intermediate rtx for it only
9652 for costing purposes. */
9653 if (speed)
9654 *cost += extra_cost->alu.logical;
9655
9656 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9657 ZERO_EXTRACT, 0, speed);
9658 return true;
9659 }
9660
9661 if (GET_CODE (op1) == NEG)
9662 {
9663 /* CMN. */
9664 if (speed)
9665 *cost += extra_cost->alu.arith;
9666
9667 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9668 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9669 return true;
9670 }
9671
9672 /* CMP.
9673
9674 Compare can freely swap the order of operands, and
9675 canonicalization puts the more complex operation first.
9676 But the integer MINUS logic expects the shift/extend
9677 operation in op1. */
9678 if (! (REG_P (op0)
9679 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9680 {
9681 op0 = XEXP (x, 1);
9682 op1 = XEXP (x, 0);
9683 }
9684 goto cost_minus;
9685 }
9686
9687 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9688 {
9689 /* FCMP. */
9690 if (speed)
9691 *cost += extra_cost->fp[mode == DFmode].compare;
9692
9693 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9694 {
9695 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9696 /* FCMP supports constant 0.0 for no extra cost. */
9697 return true;
9698 }
9699 return false;
9700 }
9701
9702 if (VECTOR_MODE_P (mode))
9703 {
9704 /* Vector compare. */
9705 if (speed)
9706 *cost += extra_cost->vect.alu;
9707
9708 if (aarch64_float_const_zero_rtx_p (op1))
9709 {
9710 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9711 cost. */
9712 return true;
9713 }
9714 return false;
9715 }
9716 return false;
9717
9718 case MINUS:
9719 {
9720 op0 = XEXP (x, 0);
9721 op1 = XEXP (x, 1);
9722
9723 cost_minus:
9724 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9725
9726 /* Detect valid immediates. */
9727 if ((GET_MODE_CLASS (mode) == MODE_INT
9728 || (GET_MODE_CLASS (mode) == MODE_CC
9729 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9730 && CONST_INT_P (op1)
9731 && aarch64_uimm12_shift (INTVAL (op1)))
9732 {
9733 if (speed)
9734 /* SUB(S) (immediate). */
9735 *cost += extra_cost->alu.arith;
9736 return true;
9737 }
9738
9739 /* Look for SUB (extended register). */
9740 if (is_a <scalar_int_mode> (mode, &int_mode)
9741 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9742 {
9743 if (speed)
9744 *cost += extra_cost->alu.extend_arith;
9745
9746 op1 = aarch64_strip_extend (op1, true);
9747 *cost += rtx_cost (op1, VOIDmode,
9748 (enum rtx_code) GET_CODE (op1), 0, speed);
9749 return true;
9750 }
9751
9752 rtx new_op1 = aarch64_strip_extend (op1, false);
9753
9754 /* Cost this as an FMA-alike operation. */
9755 if ((GET_CODE (new_op1) == MULT
9756 || aarch64_shift_p (GET_CODE (new_op1)))
9757 && code != COMPARE)
9758 {
9759 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9760 (enum rtx_code) code,
9761 speed);
9762 return true;
9763 }
9764
9765 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9766
9767 if (speed)
9768 {
9769 if (VECTOR_MODE_P (mode))
9770 {
9771 /* Vector SUB. */
9772 *cost += extra_cost->vect.alu;
9773 }
9774 else if (GET_MODE_CLASS (mode) == MODE_INT)
9775 {
9776 /* SUB(S). */
9777 *cost += extra_cost->alu.arith;
9778 }
9779 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9780 {
9781 /* FSUB. */
9782 *cost += extra_cost->fp[mode == DFmode].addsub;
9783 }
9784 }
9785 return true;
9786 }
9787
9788 case PLUS:
9789 {
9790 rtx new_op0;
9791
9792 op0 = XEXP (x, 0);
9793 op1 = XEXP (x, 1);
9794
9795 cost_plus:
9796 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9797 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9798 {
9799 /* CSINC. */
9800 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9801 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9802 return true;
9803 }
9804
9805 if (GET_MODE_CLASS (mode) == MODE_INT
9806 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9807 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9808 {
9809 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9810
9811 if (speed)
9812 /* ADD (immediate). */
9813 *cost += extra_cost->alu.arith;
9814 return true;
9815 }
9816
9817 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9818
9819 /* Look for ADD (extended register). */
9820 if (is_a <scalar_int_mode> (mode, &int_mode)
9821 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9822 {
9823 if (speed)
9824 *cost += extra_cost->alu.extend_arith;
9825
9826 op0 = aarch64_strip_extend (op0, true);
9827 *cost += rtx_cost (op0, VOIDmode,
9828 (enum rtx_code) GET_CODE (op0), 0, speed);
9829 return true;
9830 }
9831
9832 /* Strip any extend, leave shifts behind as we will
9833 cost them through mult_cost. */
9834 new_op0 = aarch64_strip_extend (op0, false);
9835
9836 if (GET_CODE (new_op0) == MULT
9837 || aarch64_shift_p (GET_CODE (new_op0)))
9838 {
9839 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9840 speed);
9841 return true;
9842 }
9843
9844 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9845
9846 if (speed)
9847 {
9848 if (VECTOR_MODE_P (mode))
9849 {
9850 /* Vector ADD. */
9851 *cost += extra_cost->vect.alu;
9852 }
9853 else if (GET_MODE_CLASS (mode) == MODE_INT)
9854 {
9855 /* ADD. */
9856 *cost += extra_cost->alu.arith;
9857 }
9858 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9859 {
9860 /* FADD. */
9861 *cost += extra_cost->fp[mode == DFmode].addsub;
9862 }
9863 }
9864 return true;
9865 }
9866
9867 case BSWAP:
9868 *cost = COSTS_N_INSNS (1);
9869
9870 if (speed)
9871 {
9872 if (VECTOR_MODE_P (mode))
9873 *cost += extra_cost->vect.alu;
9874 else
9875 *cost += extra_cost->alu.rev;
9876 }
9877 return false;
9878
9879 case IOR:
9880 if (aarch_rev16_p (x))
9881 {
9882 *cost = COSTS_N_INSNS (1);
9883
9884 if (speed)
9885 {
9886 if (VECTOR_MODE_P (mode))
9887 *cost += extra_cost->vect.alu;
9888 else
9889 *cost += extra_cost->alu.rev;
9890 }
9891 return true;
9892 }
9893
9894 if (aarch64_extr_rtx_p (x, &op0, &op1))
9895 {
9896 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9897 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9898 if (speed)
9899 *cost += extra_cost->alu.shift;
9900
9901 return true;
9902 }
9903 /* Fall through. */
9904 case XOR:
9905 case AND:
9906 cost_logic:
9907 op0 = XEXP (x, 0);
9908 op1 = XEXP (x, 1);
9909
9910 if (VECTOR_MODE_P (mode))
9911 {
9912 if (speed)
9913 *cost += extra_cost->vect.alu;
9914 return true;
9915 }
9916
9917 if (code == AND
9918 && GET_CODE (op0) == MULT
9919 && CONST_INT_P (XEXP (op0, 1))
9920 && CONST_INT_P (op1)
9921 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9922 INTVAL (op1)) != 0)
9923 {
9924 /* This is a UBFM/SBFM. */
9925 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9926 if (speed)
9927 *cost += extra_cost->alu.bfx;
9928 return true;
9929 }
9930
9931 if (is_int_mode (mode, &int_mode))
9932 {
9933 if (CONST_INT_P (op1))
9934 {
9935 /* We have a mask + shift version of a UBFIZ
9936 i.e. the *andim_ashift<mode>_bfiz pattern. */
9937 if (GET_CODE (op0) == ASHIFT
9938 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9939 XEXP (op0, 1)))
9940 {
9941 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9942 (enum rtx_code) code, 0, speed);
9943 if (speed)
9944 *cost += extra_cost->alu.bfx;
9945
9946 return true;
9947 }
9948 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9949 {
9950 /* We possibly get the immediate for free; this is not
9951 modelled. */
9952 *cost += rtx_cost (op0, int_mode,
9953 (enum rtx_code) code, 0, speed);
9954 if (speed)
9955 *cost += extra_cost->alu.logical;
9956
9957 return true;
9958 }
9959 }
9960 else
9961 {
9962 rtx new_op0 = op0;
9963
9964 /* Handle ORN, EON, or BIC. */
9965 if (GET_CODE (op0) == NOT)
9966 op0 = XEXP (op0, 0);
9967
9968 new_op0 = aarch64_strip_shift (op0);
9969
9970 /* If we had a shift on op0 then this is a logical-shift-
9971 by-register/immediate operation. Otherwise, this is just
9972 a logical operation. */
9973 if (speed)
9974 {
9975 if (new_op0 != op0)
9976 {
9977 /* Shift by immediate. */
9978 if (CONST_INT_P (XEXP (op0, 1)))
9979 *cost += extra_cost->alu.log_shift;
9980 else
9981 *cost += extra_cost->alu.log_shift_reg;
9982 }
9983 else
9984 *cost += extra_cost->alu.logical;
9985 }
9986
9987 /* In both cases we want to cost both operands. */
9988 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9989 0, speed);
9990 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9991 1, speed);
9992
9993 return true;
9994 }
9995 }
9996 return false;
9997
9998 case NOT:
9999 x = XEXP (x, 0);
10000 op0 = aarch64_strip_shift (x);
10001
10002 if (VECTOR_MODE_P (mode))
10003 {
10004 /* Vector NOT. */
10005 *cost += extra_cost->vect.alu;
10006 return false;
10007 }
10008
10009 /* MVN-shifted-reg. */
10010 if (op0 != x)
10011 {
10012 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10013
10014 if (speed)
10015 *cost += extra_cost->alu.log_shift;
10016
10017 return true;
10018 }
10019 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10020 Handle the second form here taking care that 'a' in the above can
10021 be a shift. */
10022 else if (GET_CODE (op0) == XOR)
10023 {
10024 rtx newop0 = XEXP (op0, 0);
10025 rtx newop1 = XEXP (op0, 1);
10026 rtx op0_stripped = aarch64_strip_shift (newop0);
10027
10028 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10029 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10030
10031 if (speed)
10032 {
10033 if (op0_stripped != newop0)
10034 *cost += extra_cost->alu.log_shift;
10035 else
10036 *cost += extra_cost->alu.logical;
10037 }
10038
10039 return true;
10040 }
10041 /* MVN. */
10042 if (speed)
10043 *cost += extra_cost->alu.logical;
10044
10045 return false;
10046
10047 case ZERO_EXTEND:
10048
10049 op0 = XEXP (x, 0);
10050 /* If a value is written in SI mode, then zero extended to DI
10051 mode, the operation will in general be free as a write to
10052 a 'w' register implicitly zeroes the upper bits of an 'x'
10053 register. However, if this is
10054
10055 (set (reg) (zero_extend (reg)))
10056
10057 we must cost the explicit register move. */
10058 if (mode == DImode
10059 && GET_MODE (op0) == SImode
10060 && outer == SET)
10061 {
10062 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10063
10064 /* If OP_COST is non-zero, then the cost of the zero extend
10065 is effectively the cost of the inner operation. Otherwise
10066 we have a MOV instruction and we take the cost from the MOV
10067 itself. This is true independently of whether we are
10068 optimizing for space or time. */
10069 if (op_cost)
10070 *cost = op_cost;
10071
10072 return true;
10073 }
10074 else if (MEM_P (op0))
10075 {
10076 /* All loads can zero extend to any size for free. */
10077 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10078 return true;
10079 }
10080
10081 op0 = aarch64_extend_bitfield_pattern_p (x);
10082 if (op0)
10083 {
10084 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10085 if (speed)
10086 *cost += extra_cost->alu.bfx;
10087 return true;
10088 }
10089
10090 if (speed)
10091 {
10092 if (VECTOR_MODE_P (mode))
10093 {
10094 /* UMOV. */
10095 *cost += extra_cost->vect.alu;
10096 }
10097 else
10098 {
10099 /* We generate an AND instead of UXTB/UXTH. */
10100 *cost += extra_cost->alu.logical;
10101 }
10102 }
10103 return false;
10104
10105 case SIGN_EXTEND:
10106 if (MEM_P (XEXP (x, 0)))
10107 {
10108 /* LDRSH. */
10109 if (speed)
10110 {
10111 rtx address = XEXP (XEXP (x, 0), 0);
10112 *cost += extra_cost->ldst.load_sign_extend;
10113
10114 *cost +=
10115 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10116 0, speed));
10117 }
10118 return true;
10119 }
10120
10121 op0 = aarch64_extend_bitfield_pattern_p (x);
10122 if (op0)
10123 {
10124 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10125 if (speed)
10126 *cost += extra_cost->alu.bfx;
10127 return true;
10128 }
10129
10130 if (speed)
10131 {
10132 if (VECTOR_MODE_P (mode))
10133 *cost += extra_cost->vect.alu;
10134 else
10135 *cost += extra_cost->alu.extend;
10136 }
10137 return false;
10138
10139 case ASHIFT:
10140 op0 = XEXP (x, 0);
10141 op1 = XEXP (x, 1);
10142
10143 if (CONST_INT_P (op1))
10144 {
10145 if (speed)
10146 {
10147 if (VECTOR_MODE_P (mode))
10148 {
10149 /* Vector shift (immediate). */
10150 *cost += extra_cost->vect.alu;
10151 }
10152 else
10153 {
10154 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10155 aliases. */
10156 *cost += extra_cost->alu.shift;
10157 }
10158 }
10159
10160 /* We can incorporate zero/sign extend for free. */
10161 if (GET_CODE (op0) == ZERO_EXTEND
10162 || GET_CODE (op0) == SIGN_EXTEND)
10163 op0 = XEXP (op0, 0);
10164
10165 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10166 return true;
10167 }
10168 else
10169 {
10170 if (VECTOR_MODE_P (mode))
10171 {
10172 if (speed)
10173 /* Vector shift (register). */
10174 *cost += extra_cost->vect.alu;
10175 }
10176 else
10177 {
10178 if (speed)
10179 /* LSLV. */
10180 *cost += extra_cost->alu.shift_reg;
10181
10182 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10183 && CONST_INT_P (XEXP (op1, 1))
10184 && known_eq (INTVAL (XEXP (op1, 1)),
10185 GET_MODE_BITSIZE (mode) - 1))
10186 {
10187 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10188 /* We already demanded XEXP (op1, 0) to be REG_P, so
10189 don't recurse into it. */
10190 return true;
10191 }
10192 }
10193 return false; /* All arguments need to be in registers. */
10194 }
10195
10196 case ROTATE:
10197 case ROTATERT:
10198 case LSHIFTRT:
10199 case ASHIFTRT:
10200 op0 = XEXP (x, 0);
10201 op1 = XEXP (x, 1);
10202
10203 if (CONST_INT_P (op1))
10204 {
10205 /* ASR (immediate) and friends. */
10206 if (speed)
10207 {
10208 if (VECTOR_MODE_P (mode))
10209 *cost += extra_cost->vect.alu;
10210 else
10211 *cost += extra_cost->alu.shift;
10212 }
10213
10214 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10215 return true;
10216 }
10217 else
10218 {
10219 if (VECTOR_MODE_P (mode))
10220 {
10221 if (speed)
10222 /* Vector shift (register). */
10223 *cost += extra_cost->vect.alu;
10224 }
10225 else
10226 {
10227 if (speed)
10228 /* ASR (register) and friends. */
10229 *cost += extra_cost->alu.shift_reg;
10230
10231 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10232 && CONST_INT_P (XEXP (op1, 1))
10233 && known_eq (INTVAL (XEXP (op1, 1)),
10234 GET_MODE_BITSIZE (mode) - 1))
10235 {
10236 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10237 /* We already demanded XEXP (op1, 0) to be REG_P, so
10238 don't recurse into it. */
10239 return true;
10240 }
10241 }
10242 return false; /* All arguments need to be in registers. */
10243 }
10244
10245 case SYMBOL_REF:
10246
10247 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10248 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10249 {
10250 /* LDR. */
10251 if (speed)
10252 *cost += extra_cost->ldst.load;
10253 }
10254 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10255 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10256 {
10257 /* ADRP, followed by ADD. */
10258 *cost += COSTS_N_INSNS (1);
10259 if (speed)
10260 *cost += 2 * extra_cost->alu.arith;
10261 }
10262 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10263 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10264 {
10265 /* ADR. */
10266 if (speed)
10267 *cost += extra_cost->alu.arith;
10268 }
10269
10270 if (flag_pic)
10271 {
10272 /* One extra load instruction, after accessing the GOT. */
10273 *cost += COSTS_N_INSNS (1);
10274 if (speed)
10275 *cost += extra_cost->ldst.load;
10276 }
10277 return true;
10278
10279 case HIGH:
10280 case LO_SUM:
10281 /* ADRP/ADD (immediate). */
10282 if (speed)
10283 *cost += extra_cost->alu.arith;
10284 return true;
10285
10286 case ZERO_EXTRACT:
10287 case SIGN_EXTRACT:
10288 /* UBFX/SBFX. */
10289 if (speed)
10290 {
10291 if (VECTOR_MODE_P (mode))
10292 *cost += extra_cost->vect.alu;
10293 else
10294 *cost += extra_cost->alu.bfx;
10295 }
10296
10297 /* We can trust that the immediates used will be correct (there
10298 are no by-register forms), so we need only cost op0. */
10299 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10300 return true;
10301
10302 case MULT:
10303 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10304 /* aarch64_rtx_mult_cost always handles recursion to its
10305 operands. */
10306 return true;
10307
10308 case MOD:
10309 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
10310 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same
10311 as that of an unconditional negate. This case should only ever be
10312 reached through the set_smod_pow2_cheap check in expmed.c. */
10313 if (CONST_INT_P (XEXP (x, 1))
10314 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10315 && (mode == SImode || mode == DImode))
10316 {
10317 /* We expand to 4 instructions. Reset the baseline. */
10318 *cost = COSTS_N_INSNS (4);
10319
10320 if (speed)
10321 *cost += 2 * extra_cost->alu.logical
10322 + 2 * extra_cost->alu.arith;
10323
10324 return true;
10325 }
10326
10327 /* Fall-through. */
10328 case UMOD:
10329 if (speed)
10330 {
10331 /* Slightly prefer UMOD over SMOD. */
10332 if (VECTOR_MODE_P (mode))
10333 *cost += extra_cost->vect.alu;
10334 else if (GET_MODE_CLASS (mode) == MODE_INT)
10335 *cost += (extra_cost->mult[mode == DImode].add
10336 + extra_cost->mult[mode == DImode].idiv
10337 + (code == MOD ? 1 : 0));
10338 }
10339 return false; /* All arguments need to be in registers. */
10340
10341 case DIV:
10342 case UDIV:
10343 case SQRT:
10344 if (speed)
10345 {
10346 if (VECTOR_MODE_P (mode))
10347 *cost += extra_cost->vect.alu;
10348 else if (GET_MODE_CLASS (mode) == MODE_INT)
10349 /* There is no integer SQRT, so only DIV and UDIV can get
10350 here. */
10351 *cost += (extra_cost->mult[mode == DImode].idiv
10352 /* Slightly prefer UDIV over SDIV. */
10353 + (code == DIV ? 1 : 0));
10354 else
10355 *cost += extra_cost->fp[mode == DFmode].div;
10356 }
10357 return false; /* All arguments need to be in registers. */
10358
10359 case IF_THEN_ELSE:
10360 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10361 XEXP (x, 2), cost, speed);
10362
10363 case EQ:
10364 case NE:
10365 case GT:
10366 case GTU:
10367 case LT:
10368 case LTU:
10369 case GE:
10370 case GEU:
10371 case LE:
10372 case LEU:
10373
10374 return false; /* All arguments must be in registers. */
10375
10376 case FMA:
10377 op0 = XEXP (x, 0);
10378 op1 = XEXP (x, 1);
10379 op2 = XEXP (x, 2);
10380
10381 if (speed)
10382 {
10383 if (VECTOR_MODE_P (mode))
10384 *cost += extra_cost->vect.alu;
10385 else
10386 *cost += extra_cost->fp[mode == DFmode].fma;
10387 }
10388
10389 /* FMSUB, FNMADD, and FNMSUB are free. */
10390 if (GET_CODE (op0) == NEG)
10391 op0 = XEXP (op0, 0);
10392
10393 if (GET_CODE (op2) == NEG)
10394 op2 = XEXP (op2, 0);
10395
10396 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10397 and the by-element operand as operand 0. */
10398 if (GET_CODE (op1) == NEG)
10399 op1 = XEXP (op1, 0);
10400
10401 /* Catch vector-by-element operations. The by-element operand can
10402 either be (vec_duplicate (vec_select (x))) or just
10403 (vec_select (x)), depending on whether we are multiplying by
10404 a vector or a scalar.
10405
10406 Canonicalization is not very good in these cases: FMA4 will put the
10407 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10408 if (GET_CODE (op0) == VEC_DUPLICATE)
10409 op0 = XEXP (op0, 0);
10410 else if (GET_CODE (op1) == VEC_DUPLICATE)
10411 op1 = XEXP (op1, 0);
10412
10413 if (GET_CODE (op0) == VEC_SELECT)
10414 op0 = XEXP (op0, 0);
10415 else if (GET_CODE (op1) == VEC_SELECT)
10416 op1 = XEXP (op1, 0);
10417
10418 /* If the remaining parameters are not registers,
10419 get the cost to put them into registers. */
10420 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10421 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10422 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10423 return true;
10424
10425 case FLOAT:
10426 case UNSIGNED_FLOAT:
10427 if (speed)
10428 *cost += extra_cost->fp[mode == DFmode].fromint;
10429 return false;
10430
10431 case FLOAT_EXTEND:
10432 if (speed)
10433 {
10434 if (VECTOR_MODE_P (mode))
10435 {
10436 /* Vector widening conversion. */
10437 *cost += extra_cost->vect.alu;
10438 }
10439 else
10440 *cost += extra_cost->fp[mode == DFmode].widen;
10441 }
10442 return false;
10443
10444 case FLOAT_TRUNCATE:
10445 if (speed)
10446 {
10447 if (VECTOR_MODE_P (mode))
10448 {
10449 /* Vector narrowing conversion. */
10450 *cost += extra_cost->vect.alu;
10451 }
10452 else
10453 *cost += extra_cost->fp[mode == DFmode].narrow;
10454 }
10455 return false;
10456
10457 case FIX:
10458 case UNSIGNED_FIX:
10459 x = XEXP (x, 0);
10460 /* Strip the rounding part. They will all be implemented
10461 by the fcvt* family of instructions anyway. */
10462 if (GET_CODE (x) == UNSPEC)
10463 {
10464 unsigned int uns_code = XINT (x, 1);
10465
10466 if (uns_code == UNSPEC_FRINTA
10467 || uns_code == UNSPEC_FRINTM
10468 || uns_code == UNSPEC_FRINTN
10469 || uns_code == UNSPEC_FRINTP
10470 || uns_code == UNSPEC_FRINTZ)
10471 x = XVECEXP (x, 0, 0);
10472 }
10473
10474 if (speed)
10475 {
10476 if (VECTOR_MODE_P (mode))
10477 *cost += extra_cost->vect.alu;
10478 else
10479 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10480 }
10481
10482 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10483 fixed-point fcvt. */
10484 if (GET_CODE (x) == MULT
10485 && ((VECTOR_MODE_P (mode)
10486 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10487 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10488 {
10489 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10490 0, speed);
10491 return true;
10492 }
10493
10494 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10495 return true;
10496
10497 case ABS:
10498 if (VECTOR_MODE_P (mode))
10499 {
10500 /* ABS (vector). */
10501 if (speed)
10502 *cost += extra_cost->vect.alu;
10503 }
10504 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10505 {
10506 op0 = XEXP (x, 0);
10507
10508 /* FABD, which is analogous to FADD. */
10509 if (GET_CODE (op0) == MINUS)
10510 {
10511 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10512 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10513 if (speed)
10514 *cost += extra_cost->fp[mode == DFmode].addsub;
10515
10516 return true;
10517 }
10518 /* Simple FABS is analogous to FNEG. */
10519 if (speed)
10520 *cost += extra_cost->fp[mode == DFmode].neg;
10521 }
10522 else
10523 {
10524 /* Integer ABS will either be split into
10525 two arithmetic instructions, or will be an ABS
10526 (scalar), which we don't model. */
10527 *cost = COSTS_N_INSNS (2);
10528 if (speed)
10529 *cost += 2 * extra_cost->alu.arith;
10530 }
10531 return false;
10532
10533 case SMAX:
10534 case SMIN:
10535 if (speed)
10536 {
10537 if (VECTOR_MODE_P (mode))
10538 *cost += extra_cost->vect.alu;
10539 else
10540 {
10541 /* FMAXNM/FMINNM/FMAX/FMIN.
10542 TODO: This may not be accurate for all implementations, but
10543 we do not model this in the cost tables. */
10544 *cost += extra_cost->fp[mode == DFmode].addsub;
10545 }
10546 }
10547 return false;
10548
10549 case UNSPEC:
10550 /* The floating point round to integer frint* instructions. */
10551 if (aarch64_frint_unspec_p (XINT (x, 1)))
10552 {
10553 if (speed)
10554 *cost += extra_cost->fp[mode == DFmode].roundint;
10555
10556 return false;
10557 }
10558
10559 if (XINT (x, 1) == UNSPEC_RBIT)
10560 {
10561 if (speed)
10562 *cost += extra_cost->alu.rev;
10563
10564 return false;
10565 }
10566 break;
10567
10568 case TRUNCATE:
10569
10570 /* Decompose <su>muldi3_highpart. */
10571 if (/* (truncate:DI */
10572 mode == DImode
10573 /* (lshiftrt:TI */
10574 && GET_MODE (XEXP (x, 0)) == TImode
10575 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10576 /* (mult:TI */
10577 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10578 /* (ANY_EXTEND:TI (reg:DI))
10579 (ANY_EXTEND:TI (reg:DI))) */
10580 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10581 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10582 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10583 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10584 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10585 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10586 /* (const_int 64) */
10587 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10588 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10589 {
10590 /* UMULH/SMULH. */
10591 if (speed)
10592 *cost += extra_cost->mult[mode == DImode].extend;
10593 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10594 mode, MULT, 0, speed);
10595 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10596 mode, MULT, 1, speed);
10597 return true;
10598 }
10599
10600 /* Fall through. */
10601 default:
10602 break;
10603 }
10604
10605 if (dump_file
10606 && flag_aarch64_verbose_cost)
10607 fprintf (dump_file,
10608 "\nFailed to cost RTX. Assuming default cost.\n");
10609
10610 return true;
10611 }
10612
10613 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10614 calculated for X. This cost is stored in *COST. Returns true
10615 if the total cost of X was calculated. */
10616 static bool
10617 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10618 int param, int *cost, bool speed)
10619 {
10620 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10621
10622 if (dump_file
10623 && flag_aarch64_verbose_cost)
10624 {
10625 print_rtl_single (dump_file, x);
10626 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10627 speed ? "Hot" : "Cold",
10628 *cost, result ? "final" : "partial");
10629 }
10630
10631 return result;
10632 }
10633
10634 static int
10635 aarch64_register_move_cost (machine_mode mode,
10636 reg_class_t from_i, reg_class_t to_i)
10637 {
10638 enum reg_class from = (enum reg_class) from_i;
10639 enum reg_class to = (enum reg_class) to_i;
10640 const struct cpu_regmove_cost *regmove_cost
10641 = aarch64_tune_params.regmove_cost;
10642
10643 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10644 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10645 to = GENERAL_REGS;
10646
10647 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10648 from = GENERAL_REGS;
10649
10650 /* Moving between a GPR and the stack register costs the same as GP2GP. */
10651 if ((from == GENERAL_REGS && to == STACK_REG)
10652 || (to == GENERAL_REGS && from == STACK_REG))
10653 return regmove_cost->GP2GP;
10654
10655 /* To/from the stack register, we move via the GPRs. */
10656 if (to == STACK_REG || from == STACK_REG)
10657 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10658 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10659
10660 if (known_eq (GET_MODE_SIZE (mode), 16))
10661 {
10662 /* 128-bit operations on general registers require 2 instructions. */
10663 if (from == GENERAL_REGS && to == GENERAL_REGS)
10664 return regmove_cost->GP2GP * 2;
10665 else if (from == GENERAL_REGS)
10666 return regmove_cost->GP2FP * 2;
10667 else if (to == GENERAL_REGS)
10668 return regmove_cost->FP2GP * 2;
10669
10670 /* When AdvSIMD instructions are disabled it is not possible to move
10671 a 128-bit value directly between Q registers. This is handled in
10672 secondary reload. A general register is used as a scratch to move
10673 the upper DI value, and the lower DI value is moved directly;
10674 hence the cost is the sum of three moves. */
10675 if (! TARGET_SIMD)
10676 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10677
10678 return regmove_cost->FP2FP;
10679 }
10680
10681 if (from == GENERAL_REGS && to == GENERAL_REGS)
10682 return regmove_cost->GP2GP;
10683 else if (from == GENERAL_REGS)
10684 return regmove_cost->GP2FP;
10685 else if (to == GENERAL_REGS)
10686 return regmove_cost->FP2GP;
10687
10688 return regmove_cost->FP2FP;
10689 }
10690
10691 static int
10692 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10693 reg_class_t rclass ATTRIBUTE_UNUSED,
10694 bool in ATTRIBUTE_UNUSED)
10695 {
10696 return aarch64_tune_params.memmov_cost;
10697 }
10698
10699 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10700 to optimize 1.0/sqrt. */
10701
10702 static bool
10703 use_rsqrt_p (machine_mode mode)
10704 {
10705 return (!flag_trapping_math
10706 && flag_unsafe_math_optimizations
10707 && ((aarch64_tune_params.approx_modes->recip_sqrt
10708 & AARCH64_APPROX_MODE (mode))
10709 || flag_mrecip_low_precision_sqrt));
10710 }
10711
10712 /* Function to decide when to use the approximate reciprocal square root
10713 builtin. */
10714
10715 static tree
10716 aarch64_builtin_reciprocal (tree fndecl)
10717 {
10718 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10719
10720 if (!use_rsqrt_p (mode))
10721 return NULL_TREE;
10722 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10723 }
10724
10725 /* Emit instruction sequence to compute either the approximate square root
10726 or its approximate reciprocal, depending on the flag RECP, and return
10727 whether the sequence was emitted or not. */
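/* The expansion below uses the Newton-Raphson iteration for 1/sqrt(d),
     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2,
   where FRSQRTE supplies the initial estimate and each FRSQRTS step
   computes the (3 - d * x_n * x_n) / 2 correction factor; when RECP is
   false a final multiply by the operand turns the reciprocal estimate
   into the square root itself.  */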
10728
10729 bool
10730 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10731 {
10732 machine_mode mode = GET_MODE (dst);
10733
10734 if (GET_MODE_INNER (mode) == HFmode)
10735 {
10736 gcc_assert (!recp);
10737 return false;
10738 }
10739
10740 if (!recp)
10741 {
10742 if (!(flag_mlow_precision_sqrt
10743 || (aarch64_tune_params.approx_modes->sqrt
10744 & AARCH64_APPROX_MODE (mode))))
10745 return false;
10746
10747 if (flag_finite_math_only
10748 || flag_trapping_math
10749 || !flag_unsafe_math_optimizations
10750 || optimize_function_for_size_p (cfun))
10751 return false;
10752 }
10753 else
10754 /* Caller assumes we cannot fail. */
10755 gcc_assert (use_rsqrt_p (mode));
10756
10757 machine_mode mmsk = mode_for_int_vector (mode).require ();
10758 rtx xmsk = gen_reg_rtx (mmsk);
10759 if (!recp)
10760 /* When calculating the approximate square root, compare the
10761 argument with 0.0 and create a mask. */
10762 emit_insn (gen_rtx_SET (xmsk,
10763 gen_rtx_NEG (mmsk,
10764 gen_rtx_EQ (mmsk, src,
10765 CONST0_RTX (mode)))));
10766
10767 /* Estimate the approximate reciprocal square root. */
10768 rtx xdst = gen_reg_rtx (mode);
10769 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10770
10771 /* Iterate over the series twice for SF and thrice for DF. */
10772 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10773
10774 /* Optionally iterate over the series once less for faster performance
10775 while sacrificing some accuracy. */
10776 if ((recp && flag_mrecip_low_precision_sqrt)
10777 || (!recp && flag_mlow_precision_sqrt))
10778 iterations--;
10779
10780 /* Iterate over the series to calculate the approximate reciprocal square
10781 root. */
10782 rtx x1 = gen_reg_rtx (mode);
10783 while (iterations--)
10784 {
10785 rtx x2 = gen_reg_rtx (mode);
10786 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10787
10788 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10789
10790 if (iterations > 0)
10791 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10792 }
10793
10794 if (!recp)
10795 {
10796 /* Qualify the approximate reciprocal square root when the argument is
10797 0.0 by squashing the intermediary result to 0.0. */
10798 rtx xtmp = gen_reg_rtx (mmsk);
10799 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10800 gen_rtx_SUBREG (mmsk, xdst, 0)));
10801 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10802
10803 /* Calculate the approximate square root. */
10804 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10805 }
10806
10807 /* Finalize the approximation. */
10808 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10809
10810 return true;
10811 }
10812
10813 /* Emit the instruction sequence to compute the approximation for the division
10814 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
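/* The expansion below uses the Newton-Raphson iteration for 1/d,
     x_{n+1} = x_n * (2 - d * x_n),
   where FRECPE supplies the initial estimate and FRECPS computes the
   (2 - d * x_n) correction factor; the quotient is then NUM * (1/DEN).  */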
10815
10816 bool
10817 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10818 {
10819 machine_mode mode = GET_MODE (quo);
10820
10821 if (GET_MODE_INNER (mode) == HFmode)
10822 return false;
10823
10824 bool use_approx_division_p = (flag_mlow_precision_div
10825 || (aarch64_tune_params.approx_modes->division
10826 & AARCH64_APPROX_MODE (mode)));
10827
10828 if (!flag_finite_math_only
10829 || flag_trapping_math
10830 || !flag_unsafe_math_optimizations
10831 || optimize_function_for_size_p (cfun)
10832 || !use_approx_division_p)
10833 return false;
10834
10835 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10836 return false;
10837
10838 /* Estimate the approximate reciprocal. */
10839 rtx xrcp = gen_reg_rtx (mode);
10840 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10841
10842 /* Iterate over the series twice for SF and thrice for DF. */
10843 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10844
10845 /* Optionally iterate over the series once less for faster performance,
10846 at the cost of some accuracy. */
10847 if (flag_mlow_precision_div)
10848 iterations--;
10849
10850 /* Iterate over the series to calculate the approximate reciprocal. */
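   /* A sketch of the refinement (assuming FRECPS computes 2 - a * b, the
      architecture's reciprocal step): each pass is one Newton-Raphson update
      of x ~= 1/d:

	 xtmp    = 2 - d * x_n
	 x_{n+1} = x_n * xtmp  */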
10851 rtx xtmp = gen_reg_rtx (mode);
10852 while (iterations--)
10853 {
10854 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10855
10856 if (iterations > 0)
10857 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10858 }
10859
10860 if (num != CONST1_RTX (mode))
10861 {
10862 /* As the approximate reciprocal of DEN is already calculated, only
10863 calculate the approximate division when NUM is not 1.0. */
10864 rtx xnum = force_reg (mode, num);
10865 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10866 }
10867
10868 /* Finalize the approximation. */
10869 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10870 return true;
10871 }
10872
10873 /* Return the number of instructions that can be issued per cycle. */
10874 static int
10875 aarch64_sched_issue_rate (void)
10876 {
10877 return aarch64_tune_params.issue_rate;
10878 }
10879
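/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD.  Use the issue
   rate as the lookahead depth, but disable multipass lookahead when
   scheduling for fusion.  */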
10880 static int
10881 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10882 {
10883 int issue_rate = aarch64_sched_issue_rate ();
10884
10885 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10886 }
10887
10888
10889 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10890 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10891 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10892
10893 static int
10894 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10895 int ready_index)
10896 {
10897 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10898 }
10899
10900
10901 /* Vectorizer cost model target hooks. */
10902
10903 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10904 static int
10905 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10906 tree vectype,
10907 int misalign ATTRIBUTE_UNUSED)
10908 {
10909 unsigned elements;
10910 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10911 bool fp = false;
10912
10913 if (vectype != NULL)
10914 fp = FLOAT_TYPE_P (vectype);
10915
10916 switch (type_of_cost)
10917 {
10918 case scalar_stmt:
10919 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10920
10921 case scalar_load:
10922 return costs->scalar_load_cost;
10923
10924 case scalar_store:
10925 return costs->scalar_store_cost;
10926
10927 case vector_stmt:
10928 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10929
10930 case vector_load:
10931 return costs->vec_align_load_cost;
10932
10933 case vector_store:
10934 return costs->vec_store_cost;
10935
10936 case vec_to_scalar:
10937 return costs->vec_to_scalar_cost;
10938
10939 case scalar_to_vec:
10940 return costs->scalar_to_vec_cost;
10941
10942 case unaligned_load:
10943 case vector_gather_load:
10944 return costs->vec_unalign_load_cost;
10945
10946 case unaligned_store:
10947 case vector_scatter_store:
10948 return costs->vec_unalign_store_cost;
10949
10950 case cond_branch_taken:
10951 return costs->cond_taken_branch_cost;
10952
10953 case cond_branch_not_taken:
10954 return costs->cond_not_taken_branch_cost;
10955
10956 case vec_perm:
10957 return costs->vec_permute_cost;
10958
10959 case vec_promote_demote:
10960 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10961
10962 case vec_construct:
10963 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10964 return elements / 2 + 1;
10965
10966 default:
10967 gcc_unreachable ();
10968 }
10969 }
10970
10971 /* Implement targetm.vectorize.add_stmt_cost. */
10972 static unsigned
10973 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10974 struct _stmt_vec_info *stmt_info, int misalign,
10975 enum vect_cost_model_location where)
10976 {
10977 unsigned *cost = (unsigned *) data;
10978 unsigned retval = 0;
10979
10980 if (flag_vect_cost_model)
10981 {
10982 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10983 int stmt_cost =
10984 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10985
10986 /* Statements in an inner loop relative to the loop being
10987 vectorized are weighted more heavily. The value here is
10988 arbitrary and could potentially be improved with analysis. */
10989 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10990 count *= 50; /* FIXME */
10991
10992 retval = (unsigned) (count * stmt_cost);
10993 cost[where] += retval;
10994 }
10995
10996 return retval;
10997 }
10998
10999 static void initialize_aarch64_code_model (struct gcc_options *);
11000
11001 /* Parse the TO_PARSE string and put the architecture struct that it
11002 selects into RES and the architectural features into ISA_FLAGS.
11003 Return an aarch64_parse_opt_result describing the parse result.
11004 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11005 When the TO_PARSE string contains an invalid extension,
11006 a copy of the string is created and stored to INVALID_EXTENSION. */
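/* For example (an illustration, not part of the original comment):
   "-march=armv8.2-a+sve" selects the "armv8.2-a" entry in all_architectures
   and hands "+sve" to aarch64_parse_extension.  */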
11007
11008 static enum aarch64_parse_opt_result
11009 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11010 unsigned long *isa_flags, std::string *invalid_extension)
11011 {
11012 const char *ext;
11013 const struct processor *arch;
11014 size_t len;
11015
11016 ext = strchr (to_parse, '+');
11017
11018 if (ext != NULL)
11019 len = ext - to_parse;
11020 else
11021 len = strlen (to_parse);
11022
11023 if (len == 0)
11024 return AARCH64_PARSE_MISSING_ARG;
11025
11026
11027 /* Loop through the list of supported ARCHes to find a match. */
11028 for (arch = all_architectures; arch->name != NULL; arch++)
11029 {
11030 if (strlen (arch->name) == len
11031 && strncmp (arch->name, to_parse, len) == 0)
11032 {
11033 unsigned long isa_temp = arch->flags;
11034
11035 if (ext != NULL)
11036 {
11037 /* TO_PARSE string contains at least one extension. */
11038 enum aarch64_parse_opt_result ext_res
11039 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11040
11041 if (ext_res != AARCH64_PARSE_OK)
11042 return ext_res;
11043 }
11044 /* Extension parsing was successful. Confirm the result
11045 arch and ISA flags. */
11046 *res = arch;
11047 *isa_flags = isa_temp;
11048 return AARCH64_PARSE_OK;
11049 }
11050 }
11051
11052 /* ARCH name not found in list. */
11053 return AARCH64_PARSE_INVALID_ARG;
11054 }
11055
11056 /* Parse the TO_PARSE string and put the CPU it selects into RES and its
11057 architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
11058 describing the parse result. If there is an error parsing, RES and
11059 ISA_FLAGS are left unchanged.
11060 When the TO_PARSE string contains an invalid extension,
11061 a copy of the string is created and stored to INVALID_EXTENSION. */
11062
11063 static enum aarch64_parse_opt_result
11064 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11065 unsigned long *isa_flags, std::string *invalid_extension)
11066 {
11067 const char *ext;
11068 const struct processor *cpu;
11069 size_t len;
11070
11071 ext = strchr (to_parse, '+');
11072
11073 if (ext != NULL)
11074 len = ext - to_parse;
11075 else
11076 len = strlen (to_parse);
11077
11078 if (len == 0)
11079 return AARCH64_PARSE_MISSING_ARG;
11080
11081
11082 /* Loop through the list of supported CPUs to find a match. */
11083 for (cpu = all_cores; cpu->name != NULL; cpu++)
11084 {
11085 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11086 {
11087 unsigned long isa_temp = cpu->flags;
11088
11089
11090 if (ext != NULL)
11091 {
11092 /* TO_PARSE string contains at least one extension. */
11093 enum aarch64_parse_opt_result ext_res
11094 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11095
11096 if (ext_res != AARCH64_PARSE_OK)
11097 return ext_res;
11098 }
11099 /* Extension parsing was successful. Confirm the result
11100 cpu and ISA flags. */
11101 *res = cpu;
11102 *isa_flags = isa_temp;
11103 return AARCH64_PARSE_OK;
11104 }
11105 }
11106
11107 /* CPU name not found in list. */
11108 return AARCH64_PARSE_INVALID_ARG;
11109 }
11110
11111 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11112 Return an aarch64_parse_opt_result describing the parse result.
11113 If the parsing fails, RES does not change. */
11114
11115 static enum aarch64_parse_opt_result
11116 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11117 {
11118 const struct processor *cpu;
11119
11120 /* Loop through the list of supported CPUs to find a match. */
11121 for (cpu = all_cores; cpu->name != NULL; cpu++)
11122 {
11123 if (strcmp (cpu->name, to_parse) == 0)
11124 {
11125 *res = cpu;
11126 return AARCH64_PARSE_OK;
11127 }
11128 }
11129
11130 /* CPU name not found in list. */
11131 return AARCH64_PARSE_INVALID_ARG;
11132 }
11133
11134 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11135 described in FLAG. If it is, return the index bit for that fusion type.
11136 If not, error (printing OPTION_NAME) and return zero. */
11137
11138 static unsigned int
11139 aarch64_parse_one_option_token (const char *token,
11140 size_t length,
11141 const struct aarch64_flag_desc *flag,
11142 const char *option_name)
11143 {
11144 for (; flag->name != NULL; flag++)
11145 {
11146 if (length == strlen (flag->name)
11147 && !strncmp (flag->name, token, length))
11148 return flag->flag;
11149 }
11150
11151 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
11152 return 0;
11153 }
11154
11155 /* Parse OPTION which is a comma-separated list of flags to enable.
11156 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11157 default state we inherit from the CPU tuning structures. OPTION_NAME
11158 gives the top-level option we are parsing in the -moverride string,
11159 for use in error messages. */
11160
11161 static unsigned int
11162 aarch64_parse_boolean_options (const char *option,
11163 const struct aarch64_flag_desc *flags,
11164 unsigned int initial_state,
11165 const char *option_name)
11166 {
11167 const char separator = '.';
11168 const char* specs = option;
11169 const char* ntoken = option;
11170 unsigned int found_flags = initial_state;
11171
11172 while ((ntoken = strchr (specs, separator)))
11173 {
11174 size_t token_length = ntoken - specs;
11175 unsigned token_ops = aarch64_parse_one_option_token (specs,
11176 token_length,
11177 flags,
11178 option_name);
11179 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11180 in the token stream, reset the supported operations. So:
11181
11182 adrp+add.cmp+branch.none.adrp+add
11183
11184 would have the result of turning on only adrp+add fusion. */
11185 if (!token_ops)
11186 found_flags = 0;
11187
11188 found_flags |= token_ops;
11189 specs = ++ntoken;
11190 }
11191
11192 /* The string ended with a trailing separator, so report an error. */
11193 if (!(*specs))
11194 {
11195 error ("%s string ill-formed\n", option_name);
11196 return 0;
11197 }
11198
11199 /* We still have one more token to parse. */
11200 size_t token_length = strlen (specs);
11201 unsigned token_ops = aarch64_parse_one_option_token (specs,
11202 token_length,
11203 flags,
11204 option_name);
11205 if (!token_ops)
11206 found_flags = 0;
11207
11208 found_flags |= token_ops;
11209 return found_flags;
11210 }
11211
11212 /* Support for overriding instruction fusion. */
11213
11214 static void
11215 aarch64_parse_fuse_string (const char *fuse_string,
11216 struct tune_params *tune)
11217 {
11218 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11219 aarch64_fusible_pairs,
11220 tune->fusible_ops,
11221 "fuse=");
11222 }
11223
11224 /* Support for overriding other tuning flags. */
11225
11226 static void
11227 aarch64_parse_tune_string (const char *tune_string,
11228 struct tune_params *tune)
11229 {
11230 tune->extra_tuning_flags
11231 = aarch64_parse_boolean_options (tune_string,
11232 aarch64_tuning_flags,
11233 tune->extra_tuning_flags,
11234 "tune=");
11235 }
11236
11237 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11238 Accept the valid SVE vector widths allowed by
11239 aarch64_sve_vector_bits_enum and use it to override sve_width
11240 in TUNE. */
11241
11242 static void
11243 aarch64_parse_sve_width_string (const char *tune_string,
11244 struct tune_params *tune)
11245 {
11246 int width = -1;
11247
11248 int n = sscanf (tune_string, "%d", &width);
11249 if (n == EOF)
11250 {
11251 error ("invalid format for sve_width");
11252 return;
11253 }
11254 switch (width)
11255 {
11256 case SVE_128:
11257 case SVE_256:
11258 case SVE_512:
11259 case SVE_1024:
11260 case SVE_2048:
11261 break;
11262 default:
11263 error ("invalid sve_width value: %d", width);
11264 }
11265 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11266 }
11267
11268 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11269 we understand. If it is, extract the option string and hand it off to
11270 the appropriate function. */
11271
11272 void
11273 aarch64_parse_one_override_token (const char* token,
11274 size_t length,
11275 struct tune_params *tune)
11276 {
11277 const struct aarch64_tuning_override_function *fn
11278 = aarch64_tuning_override_functions;
11279
11280 const char *option_part = strchr (token, '=');
11281 if (!option_part)
11282 {
11283 error ("tuning string missing in option (%s)", token);
11284 return;
11285 }
11286
11287 /* Get the length of the option name. */
11288 length = option_part - token;
11289 /* Skip the '=' to get to the option string. */
11290 option_part++;
11291
11292 for (; fn->name != NULL; fn++)
11293 {
11294 if (!strncmp (fn->name, token, length))
11295 {
11296 fn->parse_override (option_part, tune);
11297 return;
11298 }
11299 }
11300
11301 error ("unknown tuning option (%s)",token);
11302 return;
11303 }
11304
11305 /* Validate and clamp the TLS size according to the code model in OPTS. */
11306
11307 static void
11308 initialize_aarch64_tls_size (struct gcc_options *opts)
11309 {
11310 if (aarch64_tls_size == 0)
11311 aarch64_tls_size = 24;
11312
11313 switch (opts->x_aarch64_cmodel_var)
11314 {
11315 case AARCH64_CMODEL_TINY:
11316 /* Both the default and maximum TLS sizes allowed under tiny are 1M, which
11317 needs two instructions to address, so we clamp the size to 24. */
11318 if (aarch64_tls_size > 24)
11319 aarch64_tls_size = 24;
11320 break;
11321 case AARCH64_CMODEL_SMALL:
11322 /* The maximum TLS size allowed under small is 4G. */
11323 if (aarch64_tls_size > 32)
11324 aarch64_tls_size = 32;
11325 break;
11326 case AARCH64_CMODEL_LARGE:
11327 /* The maximum TLS size allowed under large is 16E.
11328 FIXME: 16E would require a 64-bit offset, but we only support a 48-bit offset now. */
11329 if (aarch64_tls_size > 48)
11330 aarch64_tls_size = 48;
11331 break;
11332 default:
11333 gcc_unreachable ();
11334 }
11335
11336 return;
11337 }
11338
11339 /* Parse STRING looking for options in the format:
11340 string :: option:string
11341 option :: name=substring
11342 name :: {a-z}
11343 substring :: defined by option. */
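/* For instance (illustrative only), -moverride=fuse=adrp+add:sve_width=256
   is split on ':' into "fuse=adrp+add" and "sve_width=256", each of which is
   handled by aarch64_parse_one_override_token.  */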
11344
11345 static void
11346 aarch64_parse_override_string (const char* input_string,
11347 struct tune_params* tune)
11348 {
11349 const char separator = ':';
11350 size_t string_length = strlen (input_string) + 1;
11351 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11352 char *string = string_root;
11353 strncpy (string, input_string, string_length);
11354 string[string_length - 1] = '\0';
11355
11356 char* ntoken = string;
11357
11358 while ((ntoken = strchr (string, separator)))
11359 {
11360 size_t token_length = ntoken - string;
11361 /* NUL-terminate this substring so it can be treated as a string. */
11362 *ntoken = '\0';
11363 aarch64_parse_one_override_token (string, token_length, tune);
11364 string = ++ntoken;
11365 }
11366
11367 /* One last option to parse. */
11368 aarch64_parse_one_override_token (string, strlen (string), tune);
11369 free (string_root);
11370 }
11371
11372
11373 static void
11374 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11375 {
11376 if (accepted_branch_protection_string)
11377 {
11378 opts->x_aarch64_branch_protection_string
11379 = xstrdup (accepted_branch_protection_string);
11380 }
11381
11382 /* PR 70044: We have to be careful about being called multiple times for the
11383 same function. This means all changes should be repeatable. */
11384
11385 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11386 Disable the frame pointer flag so the mid-end will not use a frame
11387 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11388 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11389 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11390 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11391 if (opts->x_flag_omit_frame_pointer == 0)
11392 opts->x_flag_omit_frame_pointer = 2;
11393
11394 /* If not optimizing for size, set the default
11395 alignment to what the target wants. */
11396 if (!opts->x_optimize_size)
11397 {
11398 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11399 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11400 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11401 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11402 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11403 opts->x_str_align_functions = aarch64_tune_params.function_align;
11404 }
11405
11406 /* We default to no pc-relative literal loads. */
11407
11408 aarch64_pcrelative_literal_loads = false;
11409
11410 /* If -mpc-relative-literal-loads is set on the command line, this
11411 implies that the user asked for PC relative literal loads. */
11412 if (opts->x_pcrelative_literal_loads == 1)
11413 aarch64_pcrelative_literal_loads = true;
11414
11415 /* In the tiny memory model it makes no sense to disallow PC relative
11416 literal pool loads. */
11417 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11418 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11419 aarch64_pcrelative_literal_loads = true;
11420
11421 /* When enabling the lower precision Newton series for the square root, also
11422 enable it for the reciprocal square root, since the latter is an
11423 intermediary step for the former. */
11424 if (flag_mlow_precision_sqrt)
11425 flag_mrecip_low_precision_sqrt = true;
11426 }
11427
11428 /* 'Unpack' the internal tuning structs and update the options
11429 in OPTS. The caller must have set up selected_tune and selected_arch
11430 as all the other target-specific codegen decisions are
11431 derived from them. */
11432
11433 void
11434 aarch64_override_options_internal (struct gcc_options *opts)
11435 {
11436 aarch64_tune_flags = selected_tune->flags;
11437 aarch64_tune = selected_tune->sched_core;
11438 /* Make a copy of the tuning parameters attached to the core, which
11439 we may later overwrite. */
11440 aarch64_tune_params = *(selected_tune->tune);
11441 aarch64_architecture_version = selected_arch->architecture_version;
11442
11443 if (opts->x_aarch64_override_tune_string)
11444 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11445 &aarch64_tune_params);
11446
11447 /* This target defaults to strict volatile bitfields. */
11448 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11449 opts->x_flag_strict_volatile_bitfields = 1;
11450
11451 if (aarch64_stack_protector_guard == SSP_GLOBAL
11452 && opts->x_aarch64_stack_protector_guard_offset_str)
11453 {
11454 error ("incompatible options -mstack-protector-guard=global and"
11455 "-mstack-protector-guard-offset=%qs",
11456 aarch64_stack_protector_guard_offset_str);
11457 }
11458
11459 if (aarch64_stack_protector_guard == SSP_SYSREG
11460 && !(opts->x_aarch64_stack_protector_guard_offset_str
11461 && opts->x_aarch64_stack_protector_guard_reg_str))
11462 {
11463 error ("both -mstack-protector-guard-offset and "
11464 "-mstack-protector-guard-reg must be used "
11465 "with -mstack-protector-guard=sysreg");
11466 }
11467
11468 if (opts->x_aarch64_stack_protector_guard_reg_str)
11469 {
11470 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11471 error ("specify a system register with a small string length.");
11472 }
11473
11474 if (opts->x_aarch64_stack_protector_guard_offset_str)
11475 {
11476 char *end;
11477 const char *str = aarch64_stack_protector_guard_offset_str;
11478 errno = 0;
11479 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11480 if (!*str || *end || errno)
11481 error ("%qs is not a valid offset in %qs", str,
11482 "-mstack-protector-guard-offset=");
11483 aarch64_stack_protector_guard_offset = offs;
11484 }
11485
11486 initialize_aarch64_code_model (opts);
11487 initialize_aarch64_tls_size (opts);
11488
11489 int queue_depth = 0;
11490 switch (aarch64_tune_params.autoprefetcher_model)
11491 {
11492 case tune_params::AUTOPREFETCHER_OFF:
11493 queue_depth = -1;
11494 break;
11495 case tune_params::AUTOPREFETCHER_WEAK:
11496 queue_depth = 0;
11497 break;
11498 case tune_params::AUTOPREFETCHER_STRONG:
11499 queue_depth = max_insn_queue_index + 1;
11500 break;
11501 default:
11502 gcc_unreachable ();
11503 }
11504
11505 /* We don't mind passing in global_options_set here as we don't use
11506 the *options_set structs anyway. */
11507 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11508 queue_depth,
11509 opts->x_param_values,
11510 global_options_set.x_param_values);
11511
11512 /* Set up parameters to be used in prefetching algorithm. Do not
11513 override the defaults unless we are tuning for a core we have
11514 researched values for. */
11515 if (aarch64_tune_params.prefetch->num_slots > 0)
11516 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11517 aarch64_tune_params.prefetch->num_slots,
11518 opts->x_param_values,
11519 global_options_set.x_param_values);
11520 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11521 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11522 aarch64_tune_params.prefetch->l1_cache_size,
11523 opts->x_param_values,
11524 global_options_set.x_param_values);
11525 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11526 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11527 aarch64_tune_params.prefetch->l1_cache_line_size,
11528 opts->x_param_values,
11529 global_options_set.x_param_values);
11530 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11531 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11532 aarch64_tune_params.prefetch->l2_cache_size,
11533 opts->x_param_values,
11534 global_options_set.x_param_values);
11535 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11536 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11537 0,
11538 opts->x_param_values,
11539 global_options_set.x_param_values);
11540 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11541 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11542 aarch64_tune_params.prefetch->minimum_stride,
11543 opts->x_param_values,
11544 global_options_set.x_param_values);
11545
11546 /* Use the alternative scheduling-pressure algorithm by default. */
11547 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11548 opts->x_param_values,
11549 global_options_set.x_param_values);
11550
11551 /* If the user hasn't changed it via configure then set the default to 64 KB
11552 for the backend. */
11553 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11554 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11555 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11556 opts->x_param_values,
11557 global_options_set.x_param_values);
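  /* Note (added for clarity): the guard-size param is expressed as a
     power-of-two exponent, so the value 16 above corresponds to a 2^16 byte
     (64 KiB) guard region.  */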
11558
11559 /* Validate the guard size. */
11560 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11561
11562 /* Enforce that interval is the same size as size so the mid-end does the
11563 right thing. */
11564 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11565 guard_size,
11566 opts->x_param_values,
11567 global_options_set.x_param_values);
11568
11569 /* The maybe_set calls won't update the value if the user has explicitly set
11570 one. Which means we need to validate that probing interval and guard size
11571 are equal. */
11572 int probe_interval
11573 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11574 if (guard_size != probe_interval)
11575 error ("stack clash guard size '%d' must be equal to probing interval "
11576 "'%d'", guard_size, probe_interval);
11577
11578 /* Enable sw prefetching at specified optimization level for
11579 CPUS that have prefetch. Lower optimization level threshold by 1
11580 when profiling is enabled. */
11581 if (opts->x_flag_prefetch_loop_arrays < 0
11582 && !opts->x_optimize_size
11583 && aarch64_tune_params.prefetch->default_opt_level >= 0
11584 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11585 opts->x_flag_prefetch_loop_arrays = 1;
11586
11587 if (opts->x_aarch64_arch_string == NULL)
11588 opts->x_aarch64_arch_string = selected_arch->name;
11589 if (opts->x_aarch64_cpu_string == NULL)
11590 opts->x_aarch64_cpu_string = selected_cpu->name;
11591 if (opts->x_aarch64_tune_string == NULL)
11592 opts->x_aarch64_tune_string = selected_tune->name;
11593
11594 aarch64_override_options_after_change_1 (opts);
11595 }
11596
11597 /* Print a hint with a suggestion for a core or architecture name that
11598 most closely resembles what the user passed in STR. ARCH is true if
11599 the user is asking for an architecture name. ARCH is false if the user
11600 is asking for a core name. */
11601
11602 static void
11603 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11604 {
11605 auto_vec<const char *> candidates;
11606 const struct processor *entry = arch ? all_architectures : all_cores;
11607 for (; entry->name != NULL; entry++)
11608 candidates.safe_push (entry->name);
11609
11610 #ifdef HAVE_LOCAL_CPU_DETECT
11611 /* Also add "native" as a possible value. */
11612 if (arch)
11613 candidates.safe_push ("native");
11614 #endif
11615
11616 char *s;
11617 const char *hint = candidates_list_and_hint (str, s, candidates);
11618 if (hint)
11619 inform (input_location, "valid arguments are: %s;"
11620 " did you mean %qs?", s, hint);
11621 else
11622 inform (input_location, "valid arguments are: %s", s);
11623
11624 XDELETEVEC (s);
11625 }
11626
11627 /* Print a hint with a suggestion for a core name that most closely resembles
11628 what the user passed in STR. */
11629
11630 inline static void
11631 aarch64_print_hint_for_core (const char *str)
11632 {
11633 aarch64_print_hint_for_core_or_arch (str, false);
11634 }
11635
11636 /* Print a hint with a suggestion for an architecture name that most closely
11637 resembles what the user passed in STR. */
11638
11639 inline static void
11640 aarch64_print_hint_for_arch (const char *str)
11641 {
11642 aarch64_print_hint_for_core_or_arch (str, true);
11643 }
11644
11645
11646 /* Print a hint with a suggestion for an extension name
11647 that most closely resembles what the user passed in STR. */
11648
11649 void
11650 aarch64_print_hint_for_extensions (const std::string &str)
11651 {
11652 auto_vec<const char *> candidates;
11653 aarch64_get_all_extension_candidates (&candidates);
11654 char *s;
11655 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11656 if (hint)
11657 inform (input_location, "valid arguments are: %s;"
11658 " did you mean %qs?", s, hint);
11659 else
11660 inform (input_location, "valid arguments are: %s;", s);
11661
11662 XDELETEVEC (s);
11663 }
11664
11665 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11666 specified in STR and throw errors if appropriate. Put the results if
11667 they are valid in RES and ISA_FLAGS. Return whether the option is
11668 valid. */
11669
11670 static bool
11671 aarch64_validate_mcpu (const char *str, const struct processor **res,
11672 unsigned long *isa_flags)
11673 {
11674 std::string invalid_extension;
11675 enum aarch64_parse_opt_result parse_res
11676 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11677
11678 if (parse_res == AARCH64_PARSE_OK)
11679 return true;
11680
11681 switch (parse_res)
11682 {
11683 case AARCH64_PARSE_MISSING_ARG:
11684 error ("missing cpu name in %<-mcpu=%s%>", str);
11685 break;
11686 case AARCH64_PARSE_INVALID_ARG:
11687 error ("unknown value %qs for -mcpu", str);
11688 aarch64_print_hint_for_core (str);
11689 break;
11690 case AARCH64_PARSE_INVALID_FEATURE:
11691 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11692 invalid_extension.c_str (), str);
11693 aarch64_print_hint_for_extensions (invalid_extension);
11694 break;
11695 default:
11696 gcc_unreachable ();
11697 }
11698
11699 return false;
11700 }
11701
11702 /* Parses CONST_STR for branch protection features specified in
11703 aarch64_branch_protect_types, and sets any global variables required. Returns
11704 the parsing result and assigns LAST_STR to the last processed token from
11705 CONST_STR so that it can be used for error reporting. */
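/* As an illustration (not from the original comment), "pac-ret+leaf+bti"
   splits on '+' into "pac-ret" (a top-level type), "leaf" (a subtype of
   pac-ret) and "bti" (another top-level type).  */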
11706
11707 static enum
11708 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11709 char** last_str)
11710 {
11711 char *str_root = xstrdup (const_str);
11712 char* token_save = NULL;
11713 char *str = strtok_r (str_root, "+", &token_save);
11714 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11715 if (!str)
11716 res = AARCH64_PARSE_MISSING_ARG;
11717 else
11718 {
11719 char *next_str = strtok_r (NULL, "+", &token_save);
11720 /* Reset the branch protection features to their defaults. */
11721 aarch64_handle_no_branch_protection (NULL, NULL);
11722
11723 while (str && res == AARCH64_PARSE_OK)
11724 {
11725 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11726 bool found = false;
11727 /* Search for this type. */
11728 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11729 {
11730 if (strcmp (str, type->name) == 0)
11731 {
11732 found = true;
11733 res = type->handler (str, next_str);
11734 str = next_str;
11735 next_str = strtok_r (NULL, "+", &token_save);
11736 }
11737 else
11738 type++;
11739 }
11740 if (found && res == AARCH64_PARSE_OK)
11741 {
11742 bool found_subtype = true;
11743 /* Loop through each token until we find one that isn't a
11744 subtype. */
11745 while (found_subtype)
11746 {
11747 found_subtype = false;
11748 const aarch64_branch_protect_type *subtype = type->subtypes;
11749 /* Search for the subtype. */
11750 while (str && subtype && subtype->name && !found_subtype
11751 && res == AARCH64_PARSE_OK)
11752 {
11753 if (strcmp (str, subtype->name) == 0)
11754 {
11755 found_subtype = true;
11756 res = subtype->handler (str, next_str);
11757 str = next_str;
11758 next_str = strtok_r (NULL, "+", &token_save);
11759 }
11760 else
11761 subtype++;
11762 }
11763 }
11764 }
11765 else if (!found)
11766 res = AARCH64_PARSE_INVALID_ARG;
11767 }
11768 }
11769 /* Copy the last processed token into the argument to pass it back.
11770 Used by option and attribute validation to print the offending token. */
11771 if (last_str)
11772 {
11773 if (str) strcpy (*last_str, str);
11774 else *last_str = NULL;
11775 }
11776 if (res == AARCH64_PARSE_OK)
11777 {
11778 /* If needed, alloc the accepted string then copy in const_str.
11779 Used by override_option_after_change_1. */
11780 if (!accepted_branch_protection_string)
11781 accepted_branch_protection_string = (char *) xmalloc (
11782 BRANCH_PROTECT_STR_MAX
11783 + 1);
11784 strncpy (accepted_branch_protection_string, const_str,
11785 BRANCH_PROTECT_STR_MAX + 1);
11786 /* Forcibly null-terminate. */
11787 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11788 }
11789 return res;
11790 }
11791
11792 static bool
11793 aarch64_validate_mbranch_protection (const char *const_str)
11794 {
11795 char *str = (char *) xmalloc (strlen (const_str) + 1);
11796 enum aarch64_parse_opt_result res =
11797 aarch64_parse_branch_protection (const_str, &str);
11798 if (res == AARCH64_PARSE_INVALID_ARG)
11799 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11800 else if (res == AARCH64_PARSE_MISSING_ARG)
11801 error ("missing arg for %<-mbranch-protection=%>");
11802 free (str);
11803 return res == AARCH64_PARSE_OK;
11804 }
11805
11806 /* Validate a command-line -march option. Parse the arch and extensions
11807 (if any) specified in STR and throw errors if appropriate. Put the
11808 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11809 option is valid. */
11810
11811 static bool
11812 aarch64_validate_march (const char *str, const struct processor **res,
11813 unsigned long *isa_flags)
11814 {
11815 std::string invalid_extension;
11816 enum aarch64_parse_opt_result parse_res
11817 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11818
11819 if (parse_res == AARCH64_PARSE_OK)
11820 return true;
11821
11822 switch (parse_res)
11823 {
11824 case AARCH64_PARSE_MISSING_ARG:
11825 error ("missing arch name in %<-march=%s%>", str);
11826 break;
11827 case AARCH64_PARSE_INVALID_ARG:
11828 error ("unknown value %qs for -march", str);
11829 aarch64_print_hint_for_arch (str);
11830 break;
11831 case AARCH64_PARSE_INVALID_FEATURE:
11832 error ("invalid feature modifier %qs in %<-march=%s%>",
11833 invalid_extension.c_str (), str);
11834 aarch64_print_hint_for_extensions (invalid_extension);
11835 break;
11836 default:
11837 gcc_unreachable ();
11838 }
11839
11840 return false;
11841 }
11842
11843 /* Validate a command-line -mtune option. Parse the cpu
11844 specified in STR and throw errors if appropriate. Put the
11845 result, if it is valid, in RES. Return whether the option is
11846 valid. */
11847
11848 static bool
11849 aarch64_validate_mtune (const char *str, const struct processor **res)
11850 {
11851 enum aarch64_parse_opt_result parse_res
11852 = aarch64_parse_tune (str, res);
11853
11854 if (parse_res == AARCH64_PARSE_OK)
11855 return true;
11856
11857 switch (parse_res)
11858 {
11859 case AARCH64_PARSE_MISSING_ARG:
11860 error ("missing cpu name in %<-mtune=%s%>", str);
11861 break;
11862 case AARCH64_PARSE_INVALID_ARG:
11863 error ("unknown value %qs for -mtune", str);
11864 aarch64_print_hint_for_core (str);
11865 break;
11866 default:
11867 gcc_unreachable ();
11868 }
11869 return false;
11870 }
11871
11872 /* Return the CPU corresponding to the enum CPU.
11873 If it doesn't specify a cpu, return the default. */
11874
11875 static const struct processor *
11876 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11877 {
11878 if (cpu != aarch64_none)
11879 return &all_cores[cpu];
11880
11881 /* The & 0x3f is to extract the bottom 6 bits that encode the
11882 default cpu as selected by the --with-cpu GCC configure option
11883 in config.gcc.
11884 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11885 flags mechanism should be reworked to make it more sane. */
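  /* Concretely, as read from this file: the cpu enum occupies bits [5:0] of
     TARGET_CPU_DEFAULT and the configure-time default ISA flags sit in the
     bits above, hence the TARGET_CPU_DEFAULT >> 6 in aarch64_override_options.  */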
11886 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11887 }
11888
11889 /* Return the architecture corresponding to the enum ARCH.
11890 If it doesn't specify a valid architecture, return the default. */
11891
11892 static const struct processor *
11893 aarch64_get_arch (enum aarch64_arch arch)
11894 {
11895 if (arch != aarch64_no_arch)
11896 return &all_architectures[arch];
11897
11898 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11899
11900 return &all_architectures[cpu->arch];
11901 }
11902
11903 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11904
11905 static poly_uint16
11906 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11907 {
11908 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11909 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11910 deciding which .md file patterns to use and when deciding whether
11911 something is a legitimate address or constant. */
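  /* For example (illustrative): -msve-vector-bits=256 gives a fixed VG of
     256 / 64 = 4, i.e. four 64-bit granules per SVE vector.  */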
11912 if (value == SVE_SCALABLE || value == SVE_128)
11913 return poly_uint16 (2, 2);
11914 else
11915 return (int) value / 64;
11916 }
11917
11918 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11919 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
11920 tuning structs. In particular it must set selected_tune and
11921 aarch64_isa_flags that define the available ISA features and tuning
11922 decisions. It must also set selected_arch as this will be used to
11923 output the .arch asm tags for each function. */
11924
11925 static void
11926 aarch64_override_options (void)
11927 {
11928 unsigned long cpu_isa = 0;
11929 unsigned long arch_isa = 0;
11930 aarch64_isa_flags = 0;
11931
11932 bool valid_cpu = true;
11933 bool valid_tune = true;
11934 bool valid_arch = true;
11935
11936 selected_cpu = NULL;
11937 selected_arch = NULL;
11938 selected_tune = NULL;
11939
11940 if (aarch64_branch_protection_string)
11941 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11942
11943 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11944 If either of -march or -mtune is given, they override their
11945 respective component of -mcpu. */
11946 if (aarch64_cpu_string)
11947 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11948 &cpu_isa);
11949
11950 if (aarch64_arch_string)
11951 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11952 &arch_isa);
11953
11954 if (aarch64_tune_string)
11955 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11956
11957 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11958 SUBTARGET_OVERRIDE_OPTIONS;
11959 #endif
11960
11961 /* If the user did not specify a processor, choose the default
11962 one for them. This will be the CPU set during configuration using
11963 --with-cpu, otherwise it is "generic". */
11964 if (!selected_cpu)
11965 {
11966 if (selected_arch)
11967 {
11968 selected_cpu = &all_cores[selected_arch->ident];
11969 aarch64_isa_flags = arch_isa;
11970 explicit_arch = selected_arch->arch;
11971 }
11972 else
11973 {
11974 /* Get default configure-time CPU. */
11975 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11976 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11977 }
11978
11979 if (selected_tune)
11980 explicit_tune_core = selected_tune->ident;
11981 }
11982 /* If both -mcpu and -march are specified check that they are architecturally
11983 compatible, warn if they're not and prefer the -march ISA flags. */
11984 else if (selected_arch)
11985 {
11986 if (selected_arch->arch != selected_cpu->arch)
11987 {
11988 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11989 all_architectures[selected_cpu->arch].name,
11990 selected_arch->name);
11991 }
11992 aarch64_isa_flags = arch_isa;
11993 explicit_arch = selected_arch->arch;
11994 explicit_tune_core = selected_tune ? selected_tune->ident
11995 : selected_cpu->ident;
11996 }
11997 else
11998 {
11999 /* -mcpu but no -march. */
12000 aarch64_isa_flags = cpu_isa;
12001 explicit_tune_core = selected_tune ? selected_tune->ident
12002 : selected_cpu->ident;
12003 gcc_assert (selected_cpu);
12004 selected_arch = &all_architectures[selected_cpu->arch];
12005 explicit_arch = selected_arch->arch;
12006 }
12007
12008 /* Set the arch as well, as we will need it when outputting
12009 the .arch directive in assembly. */
12010 if (!selected_arch)
12011 {
12012 gcc_assert (selected_cpu);
12013 selected_arch = &all_architectures[selected_cpu->arch];
12014 }
12015
12016 if (!selected_tune)
12017 selected_tune = selected_cpu;
12018
12019 if (aarch64_enable_bti == 2)
12020 {
12021 #ifdef TARGET_ENABLE_BTI
12022 aarch64_enable_bti = 1;
12023 #else
12024 aarch64_enable_bti = 0;
12025 #endif
12026 }
12027
12028 /* Return address signing is currently not supported for ILP32 targets. For
12029 LP64 targets use the configured option in the absence of a command-line
12030 option for -mbranch-protection. */
12031 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12032 {
12033 #ifdef TARGET_ENABLE_PAC_RET
12034 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12035 aarch64_ra_sign_key = AARCH64_KEY_A;
12036 #else
12037 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12038 #endif
12039 }
12040
12041 #ifndef HAVE_AS_MABI_OPTION
12042 /* The compiler may have been configured with 2.23.* binutils, which does
12043 not have support for ILP32. */
12044 if (TARGET_ILP32)
12045 error ("assembler does not support -mabi=ilp32");
12046 #endif
12047
12048 /* Convert -msve-vector-bits to a VG count. */
12049 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12050
12051 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12052 sorry ("return address signing is only supported for -mabi=lp64");
12053
12054 /* Make sure we properly set up the explicit options. */
12055 if ((aarch64_cpu_string && valid_cpu)
12056 || (aarch64_tune_string && valid_tune))
12057 gcc_assert (explicit_tune_core != aarch64_none);
12058
12059 if ((aarch64_cpu_string && valid_cpu)
12060 || (aarch64_arch_string && valid_arch))
12061 gcc_assert (explicit_arch != aarch64_no_arch);
12062
12063 /* The pass to insert speculation tracking runs before
12064 shrink-wrapping and the latter does not know how to update the
12065 tracking status. So disable it in this case. */
12066 if (aarch64_track_speculation)
12067 flag_shrink_wrap = 0;
12068
12069 aarch64_override_options_internal (&global_options);
12070
12071 /* Save these options as the default ones in case we push and pop them later
12072 while processing functions with potential target attributes. */
12073 target_option_default_node = target_option_current_node
12074 = build_target_option_node (&global_options);
12075 }
12076
12077 /* Implement targetm.override_options_after_change. */
12078
12079 static void
12080 aarch64_override_options_after_change (void)
12081 {
12082 aarch64_override_options_after_change_1 (&global_options);
12083 }
12084
12085 static struct machine_function *
12086 aarch64_init_machine_status (void)
12087 {
12088 struct machine_function *machine;
12089 machine = ggc_cleared_alloc<machine_function> ();
12090 return machine;
12091 }
12092
12093 void
12094 aarch64_init_expanders (void)
12095 {
12096 init_machine_status = aarch64_init_machine_status;
12097 }
12098
12099 /* A checking mechanism for the implementation of the various code models. */
12100 static void
12101 initialize_aarch64_code_model (struct gcc_options *opts)
12102 {
12103 if (opts->x_flag_pic)
12104 {
12105 switch (opts->x_aarch64_cmodel_var)
12106 {
12107 case AARCH64_CMODEL_TINY:
12108 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12109 break;
12110 case AARCH64_CMODEL_SMALL:
12111 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12112 aarch64_cmodel = (flag_pic == 2
12113 ? AARCH64_CMODEL_SMALL_PIC
12114 : AARCH64_CMODEL_SMALL_SPIC);
12115 #else
12116 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12117 #endif
12118 break;
12119 case AARCH64_CMODEL_LARGE:
12120 sorry ("code model %qs with -f%s", "large",
12121 opts->x_flag_pic > 1 ? "PIC" : "pic");
12122 break;
12123 default:
12124 gcc_unreachable ();
12125 }
12126 }
12127 else
12128 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12129 }
12130
12131 /* Implement TARGET_OPTION_SAVE. */
12132
12133 static void
12134 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12135 {
12136 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12137 ptr->x_aarch64_branch_protection_string
12138 = opts->x_aarch64_branch_protection_string;
12139 }
12140
12141 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12142 using the information saved in PTR. */
12143
12144 static void
12145 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12146 {
12147 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12148 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12149 opts->x_explicit_arch = ptr->x_explicit_arch;
12150 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12151 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12152 opts->x_aarch64_branch_protection_string
12153 = ptr->x_aarch64_branch_protection_string;
12154 if (opts->x_aarch64_branch_protection_string)
12155 {
12156 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12157 NULL);
12158 }
12159
12160 aarch64_override_options_internal (opts);
12161 }
12162
12163 /* Implement TARGET_OPTION_PRINT. */
12164
12165 static void
12166 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12167 {
12168 const struct processor *cpu
12169 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12170 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12171 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12172 std::string extension
12173 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12174
12175 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12176 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12177 arch->name, extension.c_str ());
12178 }
12179
12180 static GTY(()) tree aarch64_previous_fndecl;
12181
12182 void
12183 aarch64_reset_previous_fndecl (void)
12184 {
12185 aarch64_previous_fndecl = NULL;
12186 }
12187
12188 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12189 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12190 make sure optab availability predicates are recomputed when necessary. */
12191
12192 void
12193 aarch64_save_restore_target_globals (tree new_tree)
12194 {
12195 if (TREE_TARGET_GLOBALS (new_tree))
12196 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12197 else if (new_tree == target_option_default_node)
12198 restore_target_globals (&default_target_globals);
12199 else
12200 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12201 }
12202
12203 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12204 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12205 of the function, if such exists. This function may be called multiple
12206 times on a single function so use aarch64_previous_fndecl to avoid
12207 setting up identical state. */
12208
12209 static void
12210 aarch64_set_current_function (tree fndecl)
12211 {
12212 if (!fndecl || fndecl == aarch64_previous_fndecl)
12213 return;
12214
12215 tree old_tree = (aarch64_previous_fndecl
12216 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12217 : NULL_TREE);
12218
12219 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12220
12221 /* If current function has no attributes but the previous one did,
12222 use the default node. */
12223 if (!new_tree && old_tree)
12224 new_tree = target_option_default_node;
12225
12226 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12227 the default have been handled by aarch64_save_restore_target_globals from
12228 aarch64_pragma_target_parse. */
12229 if (old_tree == new_tree)
12230 return;
12231
12232 aarch64_previous_fndecl = fndecl;
12233
12234 /* First set the target options. */
12235 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12236
12237 aarch64_save_restore_target_globals (new_tree);
12238 }
12239
12240 /* Enum describing the various ways we can handle attributes.
12241 In many cases we can reuse the generic option handling machinery. */
12242
12243 enum aarch64_attr_opt_type
12244 {
12245 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12246 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12247 aarch64_attr_enum, /* Attribute sets an enum variable. */
12248 aarch64_attr_custom /* Attribute requires a custom handling function. */
12249 };
12250
12251 /* All the information needed to handle a target attribute.
12252 NAME is the name of the attribute.
12253 ATTR_TYPE specifies the type of behavior of the attribute as described
12254 in the definition of enum aarch64_attr_opt_type.
12255 ALLOW_NEG is true if the attribute supports a "no-" form.
12256 HANDLER is the function that takes the attribute string as an argument.
12257 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12258 OPT_NUM is the enum specifying the option that the attribute modifies.
12259 This is needed for attributes that mirror the behavior of a command-line
12260 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12261 aarch64_attr_enum. */
12262
12263 struct aarch64_attribute_info
12264 {
12265 const char *name;
12266 enum aarch64_attr_opt_type attr_type;
12267 bool allow_neg;
12268 bool (*handler) (const char *);
12269 enum opt_code opt_num;
12270 };
12271
12272 /* Handle the ARCH_STR argument to the arch= target attribute. */
12273
12274 static bool
12275 aarch64_handle_attr_arch (const char *str)
12276 {
12277 const struct processor *tmp_arch = NULL;
12278 std::string invalid_extension;
12279 enum aarch64_parse_opt_result parse_res
12280 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12281
12282 if (parse_res == AARCH64_PARSE_OK)
12283 {
12284 gcc_assert (tmp_arch);
12285 selected_arch = tmp_arch;
12286 explicit_arch = selected_arch->arch;
12287 return true;
12288 }
12289
12290 switch (parse_res)
12291 {
12292 case AARCH64_PARSE_MISSING_ARG:
12293 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12294 break;
12295 case AARCH64_PARSE_INVALID_ARG:
12296 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12297 aarch64_print_hint_for_arch (str);
12298 break;
12299 case AARCH64_PARSE_INVALID_FEATURE:
12300 error ("invalid feature modifier %s of value (\"%s\") in "
12301 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12302 aarch64_print_hint_for_extensions (invalid_extension);
12303 break;
12304 default:
12305 gcc_unreachable ();
12306 }
12307
12308 return false;
12309 }
12310
12311 /* Handle the argument CPU_STR to the cpu= target attribute. */
12312
12313 static bool
12314 aarch64_handle_attr_cpu (const char *str)
12315 {
12316 const struct processor *tmp_cpu = NULL;
12317 std::string invalid_extension;
12318 enum aarch64_parse_opt_result parse_res
12319 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12320
12321 if (parse_res == AARCH64_PARSE_OK)
12322 {
12323 gcc_assert (tmp_cpu);
12324 selected_tune = tmp_cpu;
12325 explicit_tune_core = selected_tune->ident;
12326
12327 selected_arch = &all_architectures[tmp_cpu->arch];
12328 explicit_arch = selected_arch->arch;
12329 return true;
12330 }
12331
12332 switch (parse_res)
12333 {
12334 case AARCH64_PARSE_MISSING_ARG:
12335 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12336 break;
12337 case AARCH64_PARSE_INVALID_ARG:
12338 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12339 aarch64_print_hint_for_core (str);
12340 break;
12341 case AARCH64_PARSE_INVALID_FEATURE:
12342 error ("invalid feature modifier %s of value (\"%s\") in "
12343 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12344 aarch64_print_hint_for_extensions (invalid_extension);
12345 break;
12346 default:
12347 gcc_unreachable ();
12348 }
12349
12350 return false;
12351 }
12352
12353 /* Handle the argument STR to the branch-protection= attribute. */
12354
12355 static bool
12356 aarch64_handle_attr_branch_protection (const char* str)
12357 {
12358 char *err_str = (char *) xmalloc (strlen (str) + 1);
12359 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12360 &err_str);
12361 bool success = false;
12362 switch (res)
12363 {
12364 case AARCH64_PARSE_MISSING_ARG:
12365 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12366 " attribute");
12367 break;
12368 case AARCH64_PARSE_INVALID_ARG:
12369 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12370 "=\")%> pragma or attribute", err_str);
12371 break;
12372 case AARCH64_PARSE_OK:
12373 success = true;
12374 /* Fall through. */
12375 case AARCH64_PARSE_INVALID_FEATURE:
12376 break;
12377 default:
12378 gcc_unreachable ();
12379 }
12380 free (err_str);
12381 return success;
12382 }
12383
12384 /* Handle the argument STR to the tune= target attribute. */
12385
12386 static bool
12387 aarch64_handle_attr_tune (const char *str)
12388 {
12389 const struct processor *tmp_tune = NULL;
12390 enum aarch64_parse_opt_result parse_res
12391 = aarch64_parse_tune (str, &tmp_tune);
12392
12393 if (parse_res == AARCH64_PARSE_OK)
12394 {
12395 gcc_assert (tmp_tune);
12396 selected_tune = tmp_tune;
12397 explicit_tune_core = selected_tune->ident;
12398 return true;
12399 }
12400
12401 switch (parse_res)
12402 {
12403 case AARCH64_PARSE_INVALID_ARG:
12404 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12405 aarch64_print_hint_for_core (str);
12406 break;
12407 default:
12408 gcc_unreachable ();
12409 }
12410
12411 return false;
12412 }
12413
12414 /* Parse an architecture extensions target attribute string specified in STR.
12415 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12416 if successful. Update aarch64_isa_flags to reflect the ISA features
12417 modified. */
12418
12419 static bool
12420 aarch64_handle_attr_isa_flags (char *str)
12421 {
12422 enum aarch64_parse_opt_result parse_res;
12423 unsigned long isa_flags = aarch64_isa_flags;
12424
12425 /* We allow "+nothing" in the beginning to clear out all architectural
12426 features if the user wants to handpick specific features. */
12427 if (strncmp ("+nothing", str, 8) == 0)
12428 {
12429 isa_flags = 0;
12430 str += 8;
12431 }
12432
12433 std::string invalid_extension;
12434 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12435
12436 if (parse_res == AARCH64_PARSE_OK)
12437 {
12438 aarch64_isa_flags = isa_flags;
12439 return true;
12440 }
12441
12442 switch (parse_res)
12443 {
12444 case AARCH64_PARSE_MISSING_ARG:
12445 error ("missing value in %<target()%> pragma or attribute");
12446 break;
12447
12448 case AARCH64_PARSE_INVALID_FEATURE:
12449 error ("invalid feature modifier %s of value (\"%s\") in "
12450 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12451 break;
12452
12453 default:
12454 gcc_unreachable ();
12455 }
12456
12457 return false;
12458 }
12459
12460 /* The target attributes that we support. On top of these we also support just
12461 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12462 handled explicitly in aarch64_process_one_target_attr. */
12463
12464 static const struct aarch64_attribute_info aarch64_attributes[] =
12465 {
12466 { "general-regs-only", aarch64_attr_mask, false, NULL,
12467 OPT_mgeneral_regs_only },
12468 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12469 OPT_mfix_cortex_a53_835769 },
12470 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12471 OPT_mfix_cortex_a53_843419 },
12472 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12473 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12474 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12475 OPT_momit_leaf_frame_pointer },
12476 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12477 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12478 OPT_march_ },
12479 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12480 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12481 OPT_mtune_ },
12482 { "branch-protection", aarch64_attr_custom, false,
12483 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12484 { "sign-return-address", aarch64_attr_enum, false, NULL,
12485 OPT_msign_return_address_ },
12486 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12487 };
12488
12489 /* Parse ARG_STR which contains the definition of one target attribute.
12490 Show appropriate errors if any or return true if the attribute is valid. */
12491
12492 static bool
12493 aarch64_process_one_target_attr (char *arg_str)
12494 {
12495 bool invert = false;
12496
12497 size_t len = strlen (arg_str);
12498
12499 if (len == 0)
12500 {
12501 error ("malformed %<target()%> pragma or attribute");
12502 return false;
12503 }
12504
12505 char *str_to_check = (char *) alloca (len + 1);
12506 strcpy (str_to_check, arg_str);
12507
12508 /* Skip leading whitespace. */
12509 while (*str_to_check == ' ' || *str_to_check == '\t')
12510 str_to_check++;
12511
12512 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12513 It is easier to detect and handle it explicitly here rather than going
12514 through the machinery for the rest of the target attributes in this
12515 function. */
12516 if (*str_to_check == '+')
12517 return aarch64_handle_attr_isa_flags (str_to_check);
12518
12519 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12520 {
12521 invert = true;
12522 str_to_check += 3;
12523 }
12524 char *arg = strchr (str_to_check, '=');
12525
12526 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12527 and point ARG to "foo". */
12528 if (arg)
12529 {
12530 *arg = '\0';
12531 arg++;
12532 }
12533 const struct aarch64_attribute_info *p_attr;
12534 bool found = false;
12535 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12536 {
12537 /* If the names don't match up, or the user has given an argument
12538 to an attribute that doesn't accept one, or didn't give an argument
12539 to an attribute that expects one, fail to match. */
12540 if (strcmp (str_to_check, p_attr->name) != 0)
12541 continue;
12542
12543 found = true;
12544 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12545 || p_attr->attr_type == aarch64_attr_enum;
12546
12547 if (attr_need_arg_p ^ (arg != NULL))
12548 {
12549 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12550 return false;
12551 }
12552
12553 /* If the name matches but the attribute does not allow "no-" versions
12554 then we can't match. */
12555 if (invert && !p_attr->allow_neg)
12556 {
12557 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12558 return false;
12559 }
12560
12561 switch (p_attr->attr_type)
12562 {
12563 /* Has a custom handler registered.
12564 For example, cpu=, arch=, tune=. */
12565 case aarch64_attr_custom:
12566 gcc_assert (p_attr->handler);
12567 if (!p_attr->handler (arg))
12568 return false;
12569 break;
12570
12571 /* Either set or unset a boolean option. */
12572 case aarch64_attr_bool:
12573 {
12574 struct cl_decoded_option decoded;
12575
12576 generate_option (p_attr->opt_num, NULL, !invert,
12577 CL_TARGET, &decoded);
12578 aarch64_handle_option (&global_options, &global_options_set,
12579 &decoded, input_location);
12580 break;
12581 }
12582 /* Set or unset a bit in the target_flags. aarch64_handle_option
12583 should know what mask to apply given the option number. */
12584 case aarch64_attr_mask:
12585 {
12586 struct cl_decoded_option decoded;
12587 /* We only need to specify the option number.
12588 aarch64_handle_option will know which mask to apply. */
12589 decoded.opt_index = p_attr->opt_num;
12590 decoded.value = !invert;
12591 aarch64_handle_option (&global_options, &global_options_set,
12592 &decoded, input_location);
12593 break;
12594 }
12595 /* Use the option setting machinery to set an option to an enum. */
12596 case aarch64_attr_enum:
12597 {
12598 gcc_assert (arg);
12599 bool valid;
12600 int value;
12601 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12602 &value, CL_TARGET);
12603 if (valid)
12604 {
12605 set_option (&global_options, NULL, p_attr->opt_num, value,
12606 NULL, DK_UNSPECIFIED, input_location,
12607 global_dc);
12608 }
12609 else
12610 {
12611 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12612 }
12613 break;
12614 }
12615 default:
12616 gcc_unreachable ();
12617 }
12618 }
12619
12620 /* If we reached here we either have found an attribute and validated
12621 it or didn't match any. If we matched an attribute but its arguments
12622 were malformed we will have returned false already. */
12623 return found;
12624 }
12625
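/* For example, "no-strict-align" matches the "strict-align" entry in
   aarch64_attributes above (an aarch64_attr_mask entry that allows
   negation) and clears the corresponding target_flags bit, whereas
   "tune=cortex-a57" is dispatched to aarch64_handle_attr_tune with ARG
   pointing at "cortex-a57".  */
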
12626 /* Count how many times the character C appears in
12627 NULL-terminated string STR. */
12628
12629 static unsigned int
12630 num_occurences_in_str (char c, char *str)
12631 {
12632 unsigned int res = 0;
12633 while (*str != '\0')
12634 {
12635 if (*str == c)
12636 res++;
12637
12638 str++;
12639 }
12640
12641 return res;
12642 }
12643
12644 /* Parse the tree in ARGS that contains the target attribute information
12645 and update the global target options space. */
12646
12647 bool
12648 aarch64_process_target_attr (tree args)
12649 {
12650 if (TREE_CODE (args) == TREE_LIST)
12651 {
12652 do
12653 {
12654 tree head = TREE_VALUE (args);
12655 if (head)
12656 {
12657 if (!aarch64_process_target_attr (head))
12658 return false;
12659 }
12660 args = TREE_CHAIN (args);
12661 } while (args);
12662
12663 return true;
12664 }
12665
12666 if (TREE_CODE (args) != STRING_CST)
12667 {
12668 error ("attribute %<target%> argument not a string");
12669 return false;
12670 }
12671
12672 size_t len = strlen (TREE_STRING_POINTER (args));
12673 char *str_to_check = (char *) alloca (len + 1);
12674 strcpy (str_to_check, TREE_STRING_POINTER (args));
12675
12676 if (len == 0)
12677 {
12678 error ("malformed %<target()%> pragma or attribute");
12679 return false;
12680 }
12681
12682 /* Used to catch empty strings between commas, e.g.
12683 attribute ((target ("attr1,,attr2"))). */
12684 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12685
12686 /* Handle multiple target attributes separated by ','. */
12687 char *token = strtok_r (str_to_check, ",", &str_to_check);
12688
12689 unsigned int num_attrs = 0;
12690 while (token)
12691 {
12692 num_attrs++;
12693 if (!aarch64_process_one_target_attr (token))
12694 {
12695 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12696 return false;
12697 }
12698
12699 token = strtok_r (NULL, ",", &str_to_check);
12700 }
12701
12702 if (num_attrs != num_commas + 1)
12703 {
12704 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12705 return false;
12706 }
12707
12708 return true;
12709 }
12710
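/* For example, target ("arch=armv8-a,strict-align") is split on ',' into
   two tokens, each handled by aarch64_process_one_target_attr above, while
   "arch=armv8-a,,strict-align" is rejected because the empty token makes
   NUM_ATTRS != NUM_COMMAS + 1.  */
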
12711 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12712 process attribute ((target ("..."))). */
12713
12714 static bool
12715 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12716 {
12717 struct cl_target_option cur_target;
12718 bool ret;
12719 tree old_optimize;
12720 tree new_target, new_optimize;
12721 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12722
12723 /* If what we're processing is the current pragma string then the
12724 target option node is already stored in target_option_current_node
12725 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12726 having to re-parse the string. This is especially useful to keep
12727 arm_neon.h compile times down since that header contains a lot
12728 of intrinsics enclosed in pragmas. */
12729 if (!existing_target && args == current_target_pragma)
12730 {
12731 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12732 return true;
12733 }
12734 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12735
12736 old_optimize = build_optimization_node (&global_options);
12737 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12738
12739 /* If the function changed the optimization levels as well as setting
12740 target options, start with the optimizations specified. */
12741 if (func_optimize && func_optimize != old_optimize)
12742 cl_optimization_restore (&global_options,
12743 TREE_OPTIMIZATION (func_optimize));
12744
12745 /* Save the current target options to restore at the end. */
12746 cl_target_option_save (&cur_target, &global_options);
12747
12748 /* If fndecl already has some target attributes applied to it, unpack
12749 them so that we add this attribute on top of them, rather than
12750 overwriting them. */
12751 if (existing_target)
12752 {
12753 struct cl_target_option *existing_options
12754 = TREE_TARGET_OPTION (existing_target);
12755
12756 if (existing_options)
12757 cl_target_option_restore (&global_options, existing_options);
12758 }
12759 else
12760 cl_target_option_restore (&global_options,
12761 TREE_TARGET_OPTION (target_option_current_node));
12762
12763 ret = aarch64_process_target_attr (args);
12764
12765 /* Set up any additional state. */
12766 if (ret)
12767 {
12768 aarch64_override_options_internal (&global_options);
12769 /* Initialize SIMD builtins if we haven't already.
12770 Set current_target_pragma to NULL for the duration so that
12771 the builtin initialization code doesn't try to tag the functions
12772 being built with the attributes specified by any current pragma, thus
12773 going into an infinite recursion. */
12774 if (TARGET_SIMD)
12775 {
12776 tree saved_current_target_pragma = current_target_pragma;
12777 current_target_pragma = NULL;
12778 aarch64_init_simd_builtins ();
12779 current_target_pragma = saved_current_target_pragma;
12780 }
12781 new_target = build_target_option_node (&global_options);
12782 }
12783 else
12784 new_target = NULL;
12785
12786 new_optimize = build_optimization_node (&global_options);
12787
12788 if (fndecl && ret)
12789 {
12790 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12791
12792 if (old_optimize != new_optimize)
12793 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12794 }
12795
12796 cl_target_option_restore (&global_options, &cur_target);
12797
12798 if (old_optimize != new_optimize)
12799 cl_optimization_restore (&global_options,
12800 TREE_OPTIMIZATION (old_optimize));
12801 return ret;
12802 }
12803
12804 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12805 tri-bool options (yes, no, don't care) and the default value is
12806 DEF, determine whether inlining is allowed. */
12807
12808 static bool
12809 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12810 int dont_care, int def)
12811 {
12812 /* If the callee doesn't care, always allow inlining. */
12813 if (callee == dont_care)
12814 return true;
12815
12816 /* If the caller doesn't care, always allow inlining. */
12817 if (caller == dont_care)
12818 return true;
12819
12820 /* Otherwise, allow inlining if either the callee and caller values
12821 agree, or if the callee is using the default value. */
12822 return (callee == caller || callee == def);
12823 }
12824
12825 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12826 to inline CALLEE into CALLER based on target-specific info.
12827 Make sure that the caller and callee have compatible architectural
12828 features. Then go through the other possible target attributes
12829 and see if they can block inlining. Try not to reject always_inline
12830 callees unless they are incompatible architecturally. */
12831
12832 static bool
12833 aarch64_can_inline_p (tree caller, tree callee)
12834 {
12835 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12836 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12837
12838 struct cl_target_option *caller_opts
12839 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12840 : target_option_default_node);
12841
12842 struct cl_target_option *callee_opts
12843 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12844 : target_option_default_node);
12845
12846 /* Callee's ISA flags should be a subset of the caller's. */
12847 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12848 != callee_opts->x_aarch64_isa_flags)
12849 return false;
12850
12851 /* Allow a non-strict-align function to be inlined into a
12852 strict-align one. */
12853 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12854 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12855 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12856 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12857 return false;
12858
12859 bool always_inline = lookup_attribute ("always_inline",
12860 DECL_ATTRIBUTES (callee));
12861
12862 /* If the architectural features match up and the callee is always_inline
12863 then the other attributes don't matter. */
12864 if (always_inline)
12865 return true;
12866
12867 if (caller_opts->x_aarch64_cmodel_var
12868 != callee_opts->x_aarch64_cmodel_var)
12869 return false;
12870
12871 if (caller_opts->x_aarch64_tls_dialect
12872 != callee_opts->x_aarch64_tls_dialect)
12873 return false;
12874
12875 /* Honour explicit requests to work around errata. */
12876 if (!aarch64_tribools_ok_for_inlining_p (
12877 caller_opts->x_aarch64_fix_a53_err835769,
12878 callee_opts->x_aarch64_fix_a53_err835769,
12879 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12880 return false;
12881
12882 if (!aarch64_tribools_ok_for_inlining_p (
12883 caller_opts->x_aarch64_fix_a53_err843419,
12884 callee_opts->x_aarch64_fix_a53_err843419,
12885 2, TARGET_FIX_ERR_A53_843419))
12886 return false;
12887
12888 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12889 caller and callee and they don't match up, reject inlining. */
12890 if (!aarch64_tribools_ok_for_inlining_p (
12891 caller_opts->x_flag_omit_leaf_frame_pointer,
12892 callee_opts->x_flag_omit_leaf_frame_pointer,
12893 2, 1))
12894 return false;
12895
12896 /* If the callee has specific tuning overrides, respect them. */
12897 if (callee_opts->x_aarch64_override_tune_string != NULL
12898 && caller_opts->x_aarch64_override_tune_string == NULL)
12899 return false;
12900
12901 /* If the user specified tuning override strings for the
12902 caller and callee and they don't match up, reject inlining.
12903 We just do a string compare here, we don't analyze the meaning
12904 of the string, as it would be too costly for little gain. */
12905 if (callee_opts->x_aarch64_override_tune_string
12906 && caller_opts->x_aarch64_override_tune_string
12907 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12908 caller_opts->x_aarch64_override_tune_string) != 0))
12909 return false;
12910
12911 return true;
12912 }
12913
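/* A small illustration of the subset rule above: a callee compiled with
   target ("+sve") cannot be inlined into a caller built without SVE, since
   the caller's ISA flags would not cover the callee's; the reverse
   direction is allowed.  An always_inline callee skips the softer checks
   (cmodel, TLS dialect, errata workarounds, tuning) but not the ISA and
   strict-align checks.  */
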
12914 /* Return true if SYMBOL_REF X binds locally. */
12915
12916 static bool
12917 aarch64_symbol_binds_local_p (const_rtx x)
12918 {
12919 return (SYMBOL_REF_DECL (x)
12920 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12921 : SYMBOL_REF_LOCAL_P (x));
12922 }
12923
12924 /* Return true if SYMBOL_REF X is thread local. */
12925 static bool
12926 aarch64_tls_symbol_p (rtx x)
12927 {
12928 if (! TARGET_HAVE_TLS)
12929 return false;
12930
12931 if (GET_CODE (x) != SYMBOL_REF)
12932 return false;
12933
12934 return SYMBOL_REF_TLS_MODEL (x) != 0;
12935 }
12936
12937 /* Classify a TLS symbol into one of the TLS kinds. */
12938 enum aarch64_symbol_type
12939 aarch64_classify_tls_symbol (rtx x)
12940 {
12941 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12942
12943 switch (tls_kind)
12944 {
12945 case TLS_MODEL_GLOBAL_DYNAMIC:
12946 case TLS_MODEL_LOCAL_DYNAMIC:
12947 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12948
12949 case TLS_MODEL_INITIAL_EXEC:
12950 switch (aarch64_cmodel)
12951 {
12952 case AARCH64_CMODEL_TINY:
12953 case AARCH64_CMODEL_TINY_PIC:
12954 return SYMBOL_TINY_TLSIE;
12955 default:
12956 return SYMBOL_SMALL_TLSIE;
12957 }
12958
12959 case TLS_MODEL_LOCAL_EXEC:
12960 if (aarch64_tls_size == 12)
12961 return SYMBOL_TLSLE12;
12962 else if (aarch64_tls_size == 24)
12963 return SYMBOL_TLSLE24;
12964 else if (aarch64_tls_size == 32)
12965 return SYMBOL_TLSLE32;
12966 else if (aarch64_tls_size == 48)
12967 return SYMBOL_TLSLE48;
12968 else
12969 gcc_unreachable ();
12970
12971 case TLS_MODEL_EMULATED:
12972 case TLS_MODEL_NONE:
12973 return SYMBOL_FORCE_TO_MEM;
12974
12975 default:
12976 gcc_unreachable ();
12977 }
12978 }
12979
12980 /* Return the correct method for accessing X + OFFSET, where X is either
12981 a SYMBOL_REF or LABEL_REF. */
12982
12983 enum aarch64_symbol_type
12984 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12985 {
12986 if (GET_CODE (x) == LABEL_REF)
12987 {
12988 switch (aarch64_cmodel)
12989 {
12990 case AARCH64_CMODEL_LARGE:
12991 return SYMBOL_FORCE_TO_MEM;
12992
12993 case AARCH64_CMODEL_TINY_PIC:
12994 case AARCH64_CMODEL_TINY:
12995 return SYMBOL_TINY_ABSOLUTE;
12996
12997 case AARCH64_CMODEL_SMALL_SPIC:
12998 case AARCH64_CMODEL_SMALL_PIC:
12999 case AARCH64_CMODEL_SMALL:
13000 return SYMBOL_SMALL_ABSOLUTE;
13001
13002 default:
13003 gcc_unreachable ();
13004 }
13005 }
13006
13007 if (GET_CODE (x) == SYMBOL_REF)
13008 {
13009 if (aarch64_tls_symbol_p (x))
13010 return aarch64_classify_tls_symbol (x);
13011
13012 switch (aarch64_cmodel)
13013 {
13014 case AARCH64_CMODEL_TINY:
13015 /* When we retrieve symbol + offset address, we have to make sure
13016 the offset does not cause overflow of the final address. But
13017 we have no way of knowing the address of symbol at compile time
13018 so we can't accurately say if the distance between the PC and
13019 symbol + offset is outside the addressable range of +/-1M in the
13020 TINY code model. So we rely on images not being greater than
13021 1M and cap the offset at 1M and anything beyond 1M will have to
13022 be loaded using an alternative mechanism. Furthermore if the
13023 symbol is a weak reference to something that isn't known to
13024 resolve to a symbol in this module, then force to memory. */
13025 if ((SYMBOL_REF_WEAK (x)
13026 && !aarch64_symbol_binds_local_p (x))
13027 || !IN_RANGE (offset, -1048575, 1048575))
13028 return SYMBOL_FORCE_TO_MEM;
13029 return SYMBOL_TINY_ABSOLUTE;
13030
13031 case AARCH64_CMODEL_SMALL:
13032 /* Same reasoning as the tiny code model, but the offset cap here is
13033 4G. */
13034 if ((SYMBOL_REF_WEAK (x)
13035 && !aarch64_symbol_binds_local_p (x))
13036 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13037 HOST_WIDE_INT_C (4294967264)))
13038 return SYMBOL_FORCE_TO_MEM;
13039 return SYMBOL_SMALL_ABSOLUTE;
13040
13041 case AARCH64_CMODEL_TINY_PIC:
13042 if (!aarch64_symbol_binds_local_p (x))
13043 return SYMBOL_TINY_GOT;
13044 return SYMBOL_TINY_ABSOLUTE;
13045
13046 case AARCH64_CMODEL_SMALL_SPIC:
13047 case AARCH64_CMODEL_SMALL_PIC:
13048 if (!aarch64_symbol_binds_local_p (x))
13049 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13050 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13051 return SYMBOL_SMALL_ABSOLUTE;
13052
13053 case AARCH64_CMODEL_LARGE:
13054 /* This is alright even in PIC code as the constant
13055 pool reference is always PC relative and within
13056 the same translation unit. */
13057 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13058 return SYMBOL_SMALL_ABSOLUTE;
13059 else
13060 return SYMBOL_FORCE_TO_MEM;
13061
13062 default:
13063 gcc_unreachable ();
13064 }
13065 }
13066
13067 /* By default push everything into the constant pool. */
13068 return SYMBOL_FORCE_TO_MEM;
13069 }
13070
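/* For instance, under the tiny code model a reference to SYM + 0x200000 is
   classified as SYMBOL_FORCE_TO_MEM because the offset exceeds the +/-1M
   cap above, whereas SYM + 0x1000 remains SYMBOL_TINY_ABSOLUTE.  */
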
13071 bool
13072 aarch64_constant_address_p (rtx x)
13073 {
13074 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13075 }
13076
13077 bool
13078 aarch64_legitimate_pic_operand_p (rtx x)
13079 {
13080 if (GET_CODE (x) == SYMBOL_REF
13081 || (GET_CODE (x) == CONST
13082 && GET_CODE (XEXP (x, 0)) == PLUS
13083 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13084 return false;
13085
13086 return true;
13087 }
13088
13089 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13090 that should be rematerialized rather than spilled. */
13091
13092 static bool
13093 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13094 {
13095 /* Support CSE and rematerialization of common constants. */
13096 if (CONST_INT_P (x)
13097 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13098 || GET_CODE (x) == CONST_VECTOR)
13099 return true;
13100
13101 /* Do not allow vector struct mode constants for Advanced SIMD.
13102 We could support 0 and -1 easily, but they need support in
13103 aarch64-simd.md. */
13104 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13105 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13106 return false;
13107
13108 /* Only accept variable-length vector constants if they can be
13109 handled directly.
13110
13111 ??? It would be possible to handle rematerialization of other
13112 constants via secondary reloads. */
13113 if (vec_flags & VEC_ANY_SVE)
13114 return aarch64_simd_valid_immediate (x, NULL);
13115
13116 if (GET_CODE (x) == HIGH)
13117 x = XEXP (x, 0);
13118
13119 /* Accept polynomial constants that can be calculated by using the
13120 destination of a move as the sole temporary. Constants that
13121 require a second temporary cannot be rematerialized (they can't be
13122 forced to memory and also aren't legitimate constants). */
13123 poly_int64 offset;
13124 if (poly_int_rtx_p (x, &offset))
13125 return aarch64_offset_temporaries (false, offset) <= 1;
13126
13127 /* If an offset is being added to something else, we need to allow the
13128 base to be moved into the destination register, meaning that there
13129 are no free temporaries for the offset. */
13130 x = strip_offset (x, &offset);
13131 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13132 return false;
13133
13134 /* Do not allow const (plus (anchor_symbol, const_int)). */
13135 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13136 return false;
13137
13138 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13139 so spilling them is better than rematerialization. */
13140 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13141 return true;
13142
13143 /* Label references are always constant. */
13144 if (GET_CODE (x) == LABEL_REF)
13145 return true;
13146
13147 return false;
13148 }
13149
13150 rtx
13151 aarch64_load_tp (rtx target)
13152 {
13153 if (!target
13154 || GET_MODE (target) != Pmode
13155 || !register_operand (target, Pmode))
13156 target = gen_reg_rtx (Pmode);
13157
13158 /* Can return in any reg. */
13159 emit_insn (gen_aarch64_load_tp_hard (target));
13160 return target;
13161 }
13162
13163 /* On AAPCS systems, this is the "struct __va_list". */
13164 static GTY(()) tree va_list_type;
13165
13166 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13167 Return the type to use as __builtin_va_list.
13168
13169 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13170
13171 struct __va_list
13172 {
13173 void *__stack;
13174 void *__gr_top;
13175 void *__vr_top;
13176 int __gr_offs;
13177 int __vr_offs;
13178 }; */
13179
13180 static tree
13181 aarch64_build_builtin_va_list (void)
13182 {
13183 tree va_list_name;
13184 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13185
13186 /* Create the type. */
13187 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13188 /* Give it the required name. */
13189 va_list_name = build_decl (BUILTINS_LOCATION,
13190 TYPE_DECL,
13191 get_identifier ("__va_list"),
13192 va_list_type);
13193 DECL_ARTIFICIAL (va_list_name) = 1;
13194 TYPE_NAME (va_list_type) = va_list_name;
13195 TYPE_STUB_DECL (va_list_type) = va_list_name;
13196
13197 /* Create the fields. */
13198 f_stack = build_decl (BUILTINS_LOCATION,
13199 FIELD_DECL, get_identifier ("__stack"),
13200 ptr_type_node);
13201 f_grtop = build_decl (BUILTINS_LOCATION,
13202 FIELD_DECL, get_identifier ("__gr_top"),
13203 ptr_type_node);
13204 f_vrtop = build_decl (BUILTINS_LOCATION,
13205 FIELD_DECL, get_identifier ("__vr_top"),
13206 ptr_type_node);
13207 f_groff = build_decl (BUILTINS_LOCATION,
13208 FIELD_DECL, get_identifier ("__gr_offs"),
13209 integer_type_node);
13210 f_vroff = build_decl (BUILTINS_LOCATION,
13211 FIELD_DECL, get_identifier ("__vr_offs"),
13212 integer_type_node);
13213
13214 /* Tell the tree-stdarg pass about our internal offset fields.
13215 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13216 purposes, to identify whether the code is updating the va_list internal
13217 offset fields in an irregular way. */
13218 va_list_gpr_counter_field = f_groff;
13219 va_list_fpr_counter_field = f_vroff;
13220
13221 DECL_ARTIFICIAL (f_stack) = 1;
13222 DECL_ARTIFICIAL (f_grtop) = 1;
13223 DECL_ARTIFICIAL (f_vrtop) = 1;
13224 DECL_ARTIFICIAL (f_groff) = 1;
13225 DECL_ARTIFICIAL (f_vroff) = 1;
13226
13227 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13228 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13229 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13230 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13231 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13232
13233 TYPE_FIELDS (va_list_type) = f_stack;
13234 DECL_CHAIN (f_stack) = f_grtop;
13235 DECL_CHAIN (f_grtop) = f_vrtop;
13236 DECL_CHAIN (f_vrtop) = f_groff;
13237 DECL_CHAIN (f_groff) = f_vroff;
13238
13239 /* Compute its layout. */
13240 layout_type (va_list_type);
13241
13242 return va_list_type;
13243 }
13244
13245 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13246 static void
13247 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13248 {
13249 const CUMULATIVE_ARGS *cum;
13250 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13251 tree stack, grtop, vrtop, groff, vroff;
13252 tree t;
13253 int gr_save_area_size = cfun->va_list_gpr_size;
13254 int vr_save_area_size = cfun->va_list_fpr_size;
13255 int vr_offset;
13256
13257 cum = &crtl->args.info;
13258 if (cfun->va_list_gpr_size)
13259 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13260 cfun->va_list_gpr_size);
13261 if (cfun->va_list_fpr_size)
13262 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13263 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13264
13265 if (!TARGET_FLOAT)
13266 {
13267 gcc_assert (cum->aapcs_nvrn == 0);
13268 vr_save_area_size = 0;
13269 }
13270
13271 f_stack = TYPE_FIELDS (va_list_type_node);
13272 f_grtop = DECL_CHAIN (f_stack);
13273 f_vrtop = DECL_CHAIN (f_grtop);
13274 f_groff = DECL_CHAIN (f_vrtop);
13275 f_vroff = DECL_CHAIN (f_groff);
13276
13277 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13278 NULL_TREE);
13279 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13280 NULL_TREE);
13281 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13282 NULL_TREE);
13283 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13284 NULL_TREE);
13285 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13286 NULL_TREE);
13287
13288 /* Emit code to initialize STACK, which points to the next varargs stack
13289 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13290 by named arguments. STACK is 8-byte aligned. */
13291 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13292 if (cum->aapcs_stack_size > 0)
13293 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13294 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13295 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13296
13297 /* Emit code to initialize GRTOP, the top of the GR save area.
13298 virtual_incoming_args_rtx should have been 16 byte aligned. */
13299 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13300 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13301 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13302
13303 /* Emit code to initialize VRTOP, the top of the VR save area.
13304 This address is gr_save_area_size bytes below GRTOP, rounded
13305 down to the next 16-byte boundary. */
13306 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13307 vr_offset = ROUND_UP (gr_save_area_size,
13308 STACK_BOUNDARY / BITS_PER_UNIT);
13309
13310 if (vr_offset)
13311 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13312 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13313 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13314
13315 /* Emit code to initialize GROFF, the offset from GRTOP of the
13316 next GPR argument. */
13317 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13318 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13319 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13320
13321 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13322 of the next VR argument. */
13323 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13324 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13325 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13326 }
13327
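/* A sketch of the resulting va_list, assuming two GP and one FP named
   argument registers were consumed and the stdarg pass did not shrink the
   save areas:

     __stack   points just past the named stack arguments
     __gr_top  == virtual_incoming_args_rtx
     __vr_top  == __gr_top - ROUND_UP (6 * UNITS_PER_WORD, 16)
     __gr_offs == -6 * UNITS_PER_WORD    (six x-registers left to save)
     __vr_offs == -7 * UNITS_PER_VREG    (seven q-registers left to save)  */
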
13328 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13329
13330 static tree
13331 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13332 gimple_seq *post_p ATTRIBUTE_UNUSED)
13333 {
13334 tree addr;
13335 bool indirect_p;
13336 bool is_ha; /* is HFA or HVA. */
13337 bool dw_align; /* double-word align. */
13338 machine_mode ag_mode = VOIDmode;
13339 int nregs;
13340 machine_mode mode;
13341
13342 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13343 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13344 HOST_WIDE_INT size, rsize, adjust, align;
13345 tree t, u, cond1, cond2;
13346
13347 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13348 if (indirect_p)
13349 type = build_pointer_type (type);
13350
13351 mode = TYPE_MODE (type);
13352
13353 f_stack = TYPE_FIELDS (va_list_type_node);
13354 f_grtop = DECL_CHAIN (f_stack);
13355 f_vrtop = DECL_CHAIN (f_grtop);
13356 f_groff = DECL_CHAIN (f_vrtop);
13357 f_vroff = DECL_CHAIN (f_groff);
13358
13359 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13360 f_stack, NULL_TREE);
13361 size = int_size_in_bytes (type);
13362
13363 bool abi_break;
13364 align
13365 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13366
13367 dw_align = false;
13368 adjust = 0;
13369 if (aarch64_vfp_is_call_or_return_candidate (mode,
13370 type,
13371 &ag_mode,
13372 &nregs,
13373 &is_ha))
13374 {
13375 /* No frontends can create types with variable-sized modes, so we
13376 shouldn't be asked to pass or return them. */
13377 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13378
13379 /* TYPE passed in fp/simd registers. */
13380 if (!TARGET_FLOAT)
13381 aarch64_err_no_fpadvsimd (mode);
13382
13383 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13384 unshare_expr (valist), f_vrtop, NULL_TREE);
13385 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13386 unshare_expr (valist), f_vroff, NULL_TREE);
13387
13388 rsize = nregs * UNITS_PER_VREG;
13389
13390 if (is_ha)
13391 {
13392 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13393 adjust = UNITS_PER_VREG - ag_size;
13394 }
13395 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13396 && size < UNITS_PER_VREG)
13397 {
13398 adjust = UNITS_PER_VREG - size;
13399 }
13400 }
13401 else
13402 {
13403 /* TYPE passed in general registers. */
13404 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13405 unshare_expr (valist), f_grtop, NULL_TREE);
13406 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13407 unshare_expr (valist), f_groff, NULL_TREE);
13408 rsize = ROUND_UP (size, UNITS_PER_WORD);
13409 nregs = rsize / UNITS_PER_WORD;
13410
13411 if (align > 8)
13412 {
13413 if (abi_break && warn_psabi)
13414 inform (input_location, "parameter passing for argument of type "
13415 "%qT changed in GCC 9.1", type);
13416 dw_align = true;
13417 }
13418
13419 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13420 && size < UNITS_PER_WORD)
13421 {
13422 adjust = UNITS_PER_WORD - size;
13423 }
13424 }
13425
13426 /* Get a local temporary for the field value. */
13427 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13428
13429 /* Emit code to branch if off >= 0. */
13430 t = build2 (GE_EXPR, boolean_type_node, off,
13431 build_int_cst (TREE_TYPE (off), 0));
13432 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13433
13434 if (dw_align)
13435 {
13436 /* Emit: offs = (offs + 15) & -16. */
13437 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13438 build_int_cst (TREE_TYPE (off), 15));
13439 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13440 build_int_cst (TREE_TYPE (off), -16));
13441 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13442 }
13443 else
13444 roundup = NULL;
13445
13446 /* Update ap.__[g|v]r_offs */
13447 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13448 build_int_cst (TREE_TYPE (off), rsize));
13449 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13450
13451 /* String up. */
13452 if (roundup)
13453 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13454
13455 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13456 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13457 build_int_cst (TREE_TYPE (f_off), 0));
13458 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13459
13460 /* String up: make sure the assignment happens before the use. */
13461 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13462 COND_EXPR_ELSE (cond1) = t;
13463
13464 /* Prepare the trees handling the argument that is passed on the stack;
13465 the top-level node will be stored in ON_STACK. */
13466 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13467 if (align > 8)
13468 {
13469 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13470 t = fold_build_pointer_plus_hwi (arg, 15);
13471 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13472 build_int_cst (TREE_TYPE (t), -16));
13473 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13474 }
13475 else
13476 roundup = NULL;
13477 /* Advance ap.__stack */
13478 t = fold_build_pointer_plus_hwi (arg, size + 7);
13479 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13480 build_int_cst (TREE_TYPE (t), -8));
13481 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13482 /* String up roundup and advance. */
13483 if (roundup)
13484 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13485 /* String up with arg */
13486 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13487 /* Big-endianness related address adjustment. */
13488 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13489 && size < UNITS_PER_WORD)
13490 {
13491 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13492 size_int (UNITS_PER_WORD - size));
13493 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13494 }
13495
13496 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13497 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13498
13499 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13500 t = off;
13501 if (adjust)
13502 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13503 build_int_cst (TREE_TYPE (off), adjust));
13504
13505 t = fold_convert (sizetype, t);
13506 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13507
13508 if (is_ha)
13509 {
13510 /* type ha; // treat as "struct {ftype field[n];}"
13511 ... [computing offs]
13512 for (i = 0; i < nregs; ++i, offs += 16)
13513 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13514 return ha; */
13515 int i;
13516 tree tmp_ha, field_t, field_ptr_t;
13517
13518 /* Declare a local variable. */
13519 tmp_ha = create_tmp_var_raw (type, "ha");
13520 gimple_add_tmp_var (tmp_ha);
13521
13522 /* Establish the base type. */
13523 switch (ag_mode)
13524 {
13525 case E_SFmode:
13526 field_t = float_type_node;
13527 field_ptr_t = float_ptr_type_node;
13528 break;
13529 case E_DFmode:
13530 field_t = double_type_node;
13531 field_ptr_t = double_ptr_type_node;
13532 break;
13533 case E_TFmode:
13534 field_t = long_double_type_node;
13535 field_ptr_t = long_double_ptr_type_node;
13536 break;
13537 case E_HFmode:
13538 field_t = aarch64_fp16_type_node;
13539 field_ptr_t = aarch64_fp16_ptr_type_node;
13540 break;
13541 case E_V2SImode:
13542 case E_V4SImode:
13543 {
13544 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13545 field_t = build_vector_type_for_mode (innertype, ag_mode);
13546 field_ptr_t = build_pointer_type (field_t);
13547 }
13548 break;
13549 default:
13550 gcc_assert (0);
13551 }
13552
13553 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
13554 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13555 addr = t;
13556 t = fold_convert (field_ptr_t, addr);
13557 t = build2 (MODIFY_EXPR, field_t,
13558 build1 (INDIRECT_REF, field_t, tmp_ha),
13559 build1 (INDIRECT_REF, field_t, t));
13560
13561 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13562 for (i = 1; i < nregs; ++i)
13563 {
13564 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13565 u = fold_convert (field_ptr_t, addr);
13566 u = build2 (MODIFY_EXPR, field_t,
13567 build2 (MEM_REF, field_t, tmp_ha,
13568 build_int_cst (field_ptr_t,
13569 (i *
13570 int_size_in_bytes (field_t)))),
13571 build1 (INDIRECT_REF, field_t, u));
13572 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13573 }
13574
13575 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13576 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13577 }
13578
13579 COND_EXPR_ELSE (cond2) = t;
13580 addr = fold_convert (build_pointer_type (type), cond1);
13581 addr = build_va_arg_indirect_ref (addr);
13582
13583 if (indirect_p)
13584 addr = build_va_arg_indirect_ref (addr);
13585
13586 return addr;
13587 }
13588
13589 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13590
13591 static void
13592 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13593 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13594 int no_rtl)
13595 {
13596 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13597 CUMULATIVE_ARGS local_cum;
13598 int gr_saved = cfun->va_list_gpr_size;
13599 int vr_saved = cfun->va_list_fpr_size;
13600
13601 /* The caller has advanced CUM up to, but not beyond, the last named
13602 argument. Advance a local copy of CUM past the last "real" named
13603 argument, to find out how many registers are left over. */
13604 local_cum = *cum;
13605 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13606
13607 /* Find out how many registers we need to save.
13608 Honor the tree-stdarg analysis results. */
13609 if (cfun->va_list_gpr_size)
13610 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13611 cfun->va_list_gpr_size / UNITS_PER_WORD);
13612 if (cfun->va_list_fpr_size)
13613 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13614 cfun->va_list_fpr_size / UNITS_PER_VREG);
13615
13616 if (!TARGET_FLOAT)
13617 {
13618 gcc_assert (local_cum.aapcs_nvrn == 0);
13619 vr_saved = 0;
13620 }
13621
13622 if (!no_rtl)
13623 {
13624 if (gr_saved > 0)
13625 {
13626 rtx ptr, mem;
13627
13628 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13629 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13630 - gr_saved * UNITS_PER_WORD);
13631 mem = gen_frame_mem (BLKmode, ptr);
13632 set_mem_alias_set (mem, get_varargs_alias_set ());
13633
13634 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13635 mem, gr_saved);
13636 }
13637 if (vr_saved > 0)
13638 {
13639 /* We can't use move_block_from_reg, because it will use
13640 the wrong mode, storing D regs only. */
13641 machine_mode mode = TImode;
13642 int off, i, vr_start;
13643
13644 /* Set OFF to the offset from virtual_incoming_args_rtx of
13645 the first vector register. The VR save area lies below
13646 the GR one, and is aligned to 16 bytes. */
13647 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13648 STACK_BOUNDARY / BITS_PER_UNIT);
13649 off -= vr_saved * UNITS_PER_VREG;
13650
13651 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13652 for (i = 0; i < vr_saved; ++i)
13653 {
13654 rtx ptr, mem;
13655
13656 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13657 mem = gen_frame_mem (mode, ptr);
13658 set_mem_alias_set (mem, get_varargs_alias_set ());
13659 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13660 off += UNITS_PER_VREG;
13661 }
13662 }
13663 }
13664
13665 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13666 any complication of having crtl->args.pretend_args_size changed. */
13667 cfun->machine->frame.saved_varargs_size
13668 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13669 STACK_BOUNDARY / BITS_PER_UNIT)
13670 + vr_saved * UNITS_PER_VREG);
13671 }
13672
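/* A sketch (not normative) of the save areas laid out above, growing
   downwards from virtual_incoming_args_rtx:

     virtual_incoming_args_rtx          <- also the va_list __gr_top
       [ GR save area: gr_saved * UNITS_PER_WORD bytes ]
       [ padding up to a 16-byte boundary ]
       [ VR save area: vr_saved * UNITS_PER_VREG bytes ]

   cfun->machine->frame.saved_varargs_size records the rounded total.  */
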
13673 static void
13674 aarch64_conditional_register_usage (void)
13675 {
13676 int i;
13677 if (!TARGET_FLOAT)
13678 {
13679 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13680 {
13681 fixed_regs[i] = 1;
13682 call_used_regs[i] = 1;
13683 }
13684 }
13685 if (!TARGET_SVE)
13686 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13687 {
13688 fixed_regs[i] = 1;
13689 call_used_regs[i] = 1;
13690 }
13691
13692 /* When tracking speculation, we need a couple of call-clobbered registers
13693 to track the speculation state. It would be nice to just use
13694 IP0 and IP1, but currently there are numerous places that just
13695 assume these registers are free for other uses (eg pointer
13696 authentication). */
13697 if (aarch64_track_speculation)
13698 {
13699 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13700 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13701 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13702 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13703 }
13704 }
13705
13706 /* Walk down the type tree of TYPE counting consecutive base elements.
13707 If *MODEP is VOIDmode, then set it to the first valid floating point
13708 type. If a non-floating point type is found, or if a floating point
13709 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13710 otherwise return the count in the sub-tree. */
13711 static int
13712 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13713 {
13714 machine_mode mode;
13715 HOST_WIDE_INT size;
13716
13717 switch (TREE_CODE (type))
13718 {
13719 case REAL_TYPE:
13720 mode = TYPE_MODE (type);
13721 if (mode != DFmode && mode != SFmode
13722 && mode != TFmode && mode != HFmode)
13723 return -1;
13724
13725 if (*modep == VOIDmode)
13726 *modep = mode;
13727
13728 if (*modep == mode)
13729 return 1;
13730
13731 break;
13732
13733 case COMPLEX_TYPE:
13734 mode = TYPE_MODE (TREE_TYPE (type));
13735 if (mode != DFmode && mode != SFmode
13736 && mode != TFmode && mode != HFmode)
13737 return -1;
13738
13739 if (*modep == VOIDmode)
13740 *modep = mode;
13741
13742 if (*modep == mode)
13743 return 2;
13744
13745 break;
13746
13747 case VECTOR_TYPE:
13748 /* Use V2SImode and V4SImode as representatives of all 64-bit
13749 and 128-bit vector types. */
13750 size = int_size_in_bytes (type);
13751 switch (size)
13752 {
13753 case 8:
13754 mode = V2SImode;
13755 break;
13756 case 16:
13757 mode = V4SImode;
13758 break;
13759 default:
13760 return -1;
13761 }
13762
13763 if (*modep == VOIDmode)
13764 *modep = mode;
13765
13766 /* Vector modes are considered to be opaque: two vectors are
13767 equivalent for the purposes of being homogeneous aggregates
13768 if they are the same size. */
13769 if (*modep == mode)
13770 return 1;
13771
13772 break;
13773
13774 case ARRAY_TYPE:
13775 {
13776 int count;
13777 tree index = TYPE_DOMAIN (type);
13778
13779 /* Can't handle incomplete types nor sizes that are not
13780 fixed. */
13781 if (!COMPLETE_TYPE_P (type)
13782 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13783 return -1;
13784
13785 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13786 if (count == -1
13787 || !index
13788 || !TYPE_MAX_VALUE (index)
13789 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13790 || !TYPE_MIN_VALUE (index)
13791 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13792 || count < 0)
13793 return -1;
13794
13795 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13796 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13797
13798 /* There must be no padding. */
13799 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13800 count * GET_MODE_BITSIZE (*modep)))
13801 return -1;
13802
13803 return count;
13804 }
13805
13806 case RECORD_TYPE:
13807 {
13808 int count = 0;
13809 int sub_count;
13810 tree field;
13811
13812 /* Can't handle incomplete types nor sizes that are not
13813 fixed. */
13814 if (!COMPLETE_TYPE_P (type)
13815 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13816 return -1;
13817
13818 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13819 {
13820 if (TREE_CODE (field) != FIELD_DECL)
13821 continue;
13822
13823 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13824 if (sub_count < 0)
13825 return -1;
13826 count += sub_count;
13827 }
13828
13829 /* There must be no padding. */
13830 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13831 count * GET_MODE_BITSIZE (*modep)))
13832 return -1;
13833
13834 return count;
13835 }
13836
13837 case UNION_TYPE:
13838 case QUAL_UNION_TYPE:
13839 {
13840 /* These aren't very interesting except in a degenerate case. */
13841 int count = 0;
13842 int sub_count;
13843 tree field;
13844
13845 /* Can't handle incomplete types nor sizes that are not
13846 fixed. */
13847 if (!COMPLETE_TYPE_P (type)
13848 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13849 return -1;
13850
13851 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13852 {
13853 if (TREE_CODE (field) != FIELD_DECL)
13854 continue;
13855
13856 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13857 if (sub_count < 0)
13858 return -1;
13859 count = count > sub_count ? count : sub_count;
13860 }
13861
13862 /* There must be no padding. */
13863 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13864 count * GET_MODE_BITSIZE (*modep)))
13865 return -1;
13866
13867 return count;
13868 }
13869
13870 default:
13871 break;
13872 }
13873
13874 return -1;
13875 }
13876
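/* Worked examples of the walk above (illustrative only):

     struct { double x, y; }                  -> 2, *modep == DFmode
     struct { float r; _Complex float c; }    -> 3, *modep == SFmode
     struct { float f; double d; }            -> -1 (mixed base types)
     float32x4_t v[2] (from arm_neon.h)       -> 2, *modep == V4SImode  */
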
13877 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13878 type as described in AAPCS64 \S 4.1.2.
13879
13880 See the comment above aarch64_composite_type_p for the notes on MODE. */
13881
13882 static bool
13883 aarch64_short_vector_p (const_tree type,
13884 machine_mode mode)
13885 {
13886 poly_int64 size = -1;
13887
13888 if (type && TREE_CODE (type) == VECTOR_TYPE)
13889 size = int_size_in_bytes (type);
13890 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13891 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13892 size = GET_MODE_SIZE (mode);
13893
13894 return known_eq (size, 8) || known_eq (size, 16);
13895 }
13896
13897 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13898 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13899 array types. The C99 floating-point complex types are also considered
13900 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13901 types, which are GCC extensions and out of the scope of AAPCS64, are
13902 treated as composite types here as well.
13903
13904 Note that MODE itself is not sufficient in determining whether a type
13905 is such a composite type or not. This is because
13906 stor-layout.c:compute_record_mode may have already changed the MODE
13907 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13908 structure with only one field may have its MODE set to the mode of the
13909 field. Also an integer mode whose size matches the size of the
13910 RECORD_TYPE type may be used to substitute the original mode
13911 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13912 solely relied on. */
13913
13914 static bool
13915 aarch64_composite_type_p (const_tree type,
13916 machine_mode mode)
13917 {
13918 if (aarch64_short_vector_p (type, mode))
13919 return false;
13920
13921 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13922 return true;
13923
13924 if (mode == BLKmode
13925 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13926 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13927 return true;
13928
13929 return false;
13930 }
13931
13932 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13933 shall be passed or returned in simd/fp register(s) (providing these
13934 parameter passing registers are available).
13935
13936 Upon successful return, *COUNT returns the number of needed registers,
13937 *BASE_MODE returns the mode of the individual register and when IS_HA
13938 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13939 floating-point aggregate or a homogeneous short-vector aggregate. */
13940
13941 static bool
13942 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13943 const_tree type,
13944 machine_mode *base_mode,
13945 int *count,
13946 bool *is_ha)
13947 {
13948 machine_mode new_mode = VOIDmode;
13949 bool composite_p = aarch64_composite_type_p (type, mode);
13950
13951 if (is_ha != NULL) *is_ha = false;
13952
13953 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13954 || aarch64_short_vector_p (type, mode))
13955 {
13956 *count = 1;
13957 new_mode = mode;
13958 }
13959 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13960 {
13961 if (is_ha != NULL) *is_ha = true;
13962 *count = 2;
13963 new_mode = GET_MODE_INNER (mode);
13964 }
13965 else if (type && composite_p)
13966 {
13967 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13968
13969 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13970 {
13971 if (is_ha != NULL) *is_ha = true;
13972 *count = ag_count;
13973 }
13974 else
13975 return false;
13976 }
13977 else
13978 return false;
13979
13980 *base_mode = new_mode;
13981 return true;
13982 }
13983
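/* Examples (illustrative): _Complex double yields *count == 2 with
   *base_mode == DFmode and *is_ha set; a struct of four floats is a
   four-element HFA with SFmode elements; a struct of five floats exceeds
   HA_MAX_NUM_FLDS (four, per AAPCS64) and is not a candidate.  */
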
13984 /* Implement TARGET_STRUCT_VALUE_RTX. */
13985
13986 static rtx
13987 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13988 int incoming ATTRIBUTE_UNUSED)
13989 {
13990 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13991 }
13992
13993 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
13994 static bool
13995 aarch64_vector_mode_supported_p (machine_mode mode)
13996 {
13997 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13998 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13999 }
14000
14001 /* Return appropriate SIMD container
14002 for MODE within a vector of WIDTH bits. */
14003 static machine_mode
14004 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14005 {
14006 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14007 switch (mode)
14008 {
14009 case E_DFmode:
14010 return VNx2DFmode;
14011 case E_SFmode:
14012 return VNx4SFmode;
14013 case E_HFmode:
14014 return VNx8HFmode;
14015 case E_DImode:
14016 return VNx2DImode;
14017 case E_SImode:
14018 return VNx4SImode;
14019 case E_HImode:
14020 return VNx8HImode;
14021 case E_QImode:
14022 return VNx16QImode;
14023 default:
14024 return word_mode;
14025 }
14026
14027 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14028 if (TARGET_SIMD)
14029 {
14030 if (known_eq (width, 128))
14031 switch (mode)
14032 {
14033 case E_DFmode:
14034 return V2DFmode;
14035 case E_SFmode:
14036 return V4SFmode;
14037 case E_HFmode:
14038 return V8HFmode;
14039 case E_SImode:
14040 return V4SImode;
14041 case E_HImode:
14042 return V8HImode;
14043 case E_QImode:
14044 return V16QImode;
14045 case E_DImode:
14046 return V2DImode;
14047 default:
14048 break;
14049 }
14050 else
14051 switch (mode)
14052 {
14053 case E_SFmode:
14054 return V2SFmode;
14055 case E_HFmode:
14056 return V4HFmode;
14057 case E_SImode:
14058 return V2SImode;
14059 case E_HImode:
14060 return V4HImode;
14061 case E_QImode:
14062 return V8QImode;
14063 default:
14064 break;
14065 }
14066 }
14067 return word_mode;
14068 }
14069
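/* For example, with TARGET_SIMD, (SImode, 128) maps to V4SImode and
   (HFmode, 64) to V4HFmode; with TARGET_SVE and WIDTH equal to
   BITS_PER_SVE_VECTOR, SImode maps to VNx4SImode.  Unhandled combinations
   fall back to word_mode.  */
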
14070 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14071 static machine_mode
14072 aarch64_preferred_simd_mode (scalar_mode mode)
14073 {
14074 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14075 return aarch64_simd_container_mode (mode, bits);
14076 }
14077
14078 /* Return a list of possible vector sizes for the vectorizer
14079 to iterate over. */
14080 static void
14081 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14082 {
14083 if (TARGET_SVE)
14084 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14085 sizes->safe_push (16);
14086 sizes->safe_push (8);
14087 }
14088
14089 /* Implement TARGET_MANGLE_TYPE. */
14090
14091 static const char *
14092 aarch64_mangle_type (const_tree type)
14093 {
14094 /* The AArch64 ABI documents say that "__va_list" has to be
14095 mangled as if it is in the "std" namespace. */
14096 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14097 return "St9__va_list";
14098
14099 /* Half-precision float. */
14100 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14101 return "Dh";
14102
14103 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14104 builtin types. */
14105 if (TYPE_NAME (type) != NULL)
14106 return aarch64_mangle_builtin_type (type);
14107
14108 /* Use the default mangling. */
14109 return NULL;
14110 }
14111
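/* Mangling examples implied by the hook above: the AAPCS64 __va_list type
   mangles as "St9__va_list", i.e. as if declared in namespace std, and the
   half-precision __fp16 type mangles as "Dh".  */
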
14112 /* Find the first rtx_insn before insn that will generate an assembly
14113 instruction. */
14114
14115 static rtx_insn *
14116 aarch64_prev_real_insn (rtx_insn *insn)
14117 {
14118 if (!insn)
14119 return NULL;
14120
14121 do
14122 {
14123 insn = prev_real_insn (insn);
14124 }
14125 while (insn && recog_memoized (insn) < 0);
14126
14127 return insn;
14128 }
14129
14130 static bool
14131 is_madd_op (enum attr_type t1)
14132 {
14133 unsigned int i;
14134 /* A number of these may be AArch32 only. */
14135 enum attr_type mlatypes[] = {
14136 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14137 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14138 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14139 };
14140
14141 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14142 {
14143 if (t1 == mlatypes[i])
14144 return true;
14145 }
14146
14147 return false;
14148 }
14149
14150 /* Check if there is a register dependency between a load and the insn
14151 for which we hold recog_data. */
14152
14153 static bool
14154 dep_between_memop_and_curr (rtx memop)
14155 {
14156 rtx load_reg;
14157 int opno;
14158
14159 gcc_assert (GET_CODE (memop) == SET);
14160
14161 if (!REG_P (SET_DEST (memop)))
14162 return false;
14163
14164 load_reg = SET_DEST (memop);
14165 for (opno = 1; opno < recog_data.n_operands; opno++)
14166 {
14167 rtx operand = recog_data.operand[opno];
14168 if (REG_P (operand)
14169 && reg_overlap_mentioned_p (load_reg, operand))
14170 return true;
14171
14172 }
14173 return false;
14174 }
14175
14176
14177 /* When working around the Cortex-A53 erratum 835769,
14178 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14179 instruction and has a preceding memory instruction such that a NOP
14180 should be inserted between them. */
14181
14182 bool
14183 aarch64_madd_needs_nop (rtx_insn* insn)
14184 {
14185 enum attr_type attr_type;
14186 rtx_insn *prev;
14187 rtx body;
14188
14189 if (!TARGET_FIX_ERR_A53_835769)
14190 return false;
14191
14192 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14193 return false;
14194
14195 attr_type = get_attr_type (insn);
14196 if (!is_madd_op (attr_type))
14197 return false;
14198
14199 prev = aarch64_prev_real_insn (insn);
14200 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14201 Restore recog state to INSN to avoid state corruption. */
14202 extract_constrain_insn_cached (insn);
14203
14204 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14205 return false;
14206
14207 body = single_set (prev);
14208
14209 /* If the previous insn is a memory op and there is no dependency between
14210 it and the DImode madd, emit a NOP between them. If body is NULL then we
14211 have a complex memory operation, probably a load/store pair.
14212 Be conservative for now and emit a NOP. */
14213 if (GET_MODE (recog_data.operand[0]) == DImode
14214 && (!body || !dep_between_memop_and_curr (body)))
14215 return true;
14216
14217 return false;
14218
14219 }
14220
14221
14222 /* Implement FINAL_PRESCAN_INSN. */
14223
14224 void
14225 aarch64_final_prescan_insn (rtx_insn *insn)
14226 {
14227 if (aarch64_madd_needs_nop (insn))
14228 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14229 }
14230
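/* With -mfix-cortex-a53-835769, a sequence such as

       ldr  x2, [x1]
       madd x0, x3, x4, x5

   (where the madd does not read the loaded value) is emitted as
   ldr / nop / madd: aarch64_madd_needs_nop sees a 64-bit multiply-accumulate
   preceded by a memory operation with no register dependency, and the
   prescan hook above inserts the separating NOP (an illustrative sketch,
   not an exhaustive description of the erratum conditions).  */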
14231
14232 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14233 instruction. */
14234
14235 bool
14236 aarch64_sve_index_immediate_p (rtx base_or_step)
14237 {
14238 return (CONST_INT_P (base_or_step)
14239 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14240 }
14241
14242 /* Return true if X is a valid immediate for the SVE ADD and SUB
14243 instructions. Negate X first if NEGATE_P is true. */
14244
14245 bool
14246 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14247 {
14248 rtx elt;
14249
14250 if (!const_vec_duplicate_p (x, &elt)
14251 || !CONST_INT_P (elt))
14252 return false;
14253
14254 HOST_WIDE_INT val = INTVAL (elt);
14255 if (negate_p)
14256 val = -val;
14257 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14258
14259 if (val & 0xff)
14260 return IN_RANGE (val, 0, 0xff);
14261 return IN_RANGE (val, 0, 0xff00);
14262 }
14263
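/* The test above corresponds to the unsigned ADD/SUB immediate encodings:
   a duplicated value of 0..255, or a multiple of 256 up to 0xff00 (the
   "LSL #8" form).  E.g. a splat of 512 is accepted, a splat of 257 is
   not.  */
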
14264 /* Return true if X is a valid immediate operand for an SVE logical
14265 instruction such as AND. */
14266
14267 bool
14268 aarch64_sve_bitmask_immediate_p (rtx x)
14269 {
14270 rtx elt;
14271
14272 return (const_vec_duplicate_p (x, &elt)
14273 && CONST_INT_P (elt)
14274 && aarch64_bitmask_imm (INTVAL (elt),
14275 GET_MODE_INNER (GET_MODE (x))));
14276 }
14277
14278 /* Return true if X is a valid immediate for the SVE DUP and CPY
14279 instructions. */
14280
14281 bool
14282 aarch64_sve_dup_immediate_p (rtx x)
14283 {
14284 rtx elt;
14285
14286 if (!const_vec_duplicate_p (x, &elt)
14287 || !CONST_INT_P (elt))
14288 return false;
14289
14290 HOST_WIDE_INT val = INTVAL (elt);
14291 if (val & 0xff)
14292 return IN_RANGE (val, -0x80, 0x7f);
14293 return IN_RANGE (val, -0x8000, 0x7f00);
14294 }
14295
14296 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14297 SIGNED_P says whether the operand is signed rather than unsigned. */
14298
14299 bool
14300 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14301 {
14302 rtx elt;
14303
14304 return (const_vec_duplicate_p (x, &elt)
14305 && CONST_INT_P (elt)
14306 && (signed_p
14307 ? IN_RANGE (INTVAL (elt), -16, 15)
14308 : IN_RANGE (INTVAL (elt), 0, 127)));
14309 }
14310
14311 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14312 instruction. Negate X first if NEGATE_P is true. */
14313
14314 bool
14315 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14316 {
14317 rtx elt;
14318 REAL_VALUE_TYPE r;
14319
14320 if (!const_vec_duplicate_p (x, &elt)
14321 || GET_CODE (elt) != CONST_DOUBLE)
14322 return false;
14323
14324 r = *CONST_DOUBLE_REAL_VALUE (elt);
14325
14326 if (negate_p)
14327 r = real_value_negate (&r);
14328
14329 if (real_equal (&r, &dconst1))
14330 return true;
14331 if (real_equal (&r, &dconsthalf))
14332 return true;
14333 return false;
14334 }
14335
14336 /* Return true if X is a valid immediate operand for an SVE FMUL
14337 instruction. */
14338
14339 bool
14340 aarch64_sve_float_mul_immediate_p (rtx x)
14341 {
14342 rtx elt;
14343
14344 /* GCC will never generate a multiply with an immediate of 2, so there is no
14345 point testing for it (even though it is a valid constant). */
14346 return (const_vec_duplicate_p (x, &elt)
14347 && GET_CODE (elt) == CONST_DOUBLE
14348 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14349 }
14350
14351 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14352 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14353 is nonnull, use it to describe valid immediates. */
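/* Worked examples (illustrative): 0x00ab0000 matches the 4-byte LSL case
   as 0xab << 16; 0x004b004b matches the 2-byte case as 0x4b with no shift;
   0x0000abff matches only the MSL case, as 0xab with MSL #8, since the
   bits below the shifted byte must all be ones.  */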
14354 static bool
14355 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14356 simd_immediate_info *info,
14357 enum simd_immediate_check which,
14358 simd_immediate_info::insn_type insn)
14359 {
14360 /* Try a 4-byte immediate with LSL. */
14361 for (unsigned int shift = 0; shift < 32; shift += 8)
14362 if ((val32 & (0xff << shift)) == val32)
14363 {
14364 if (info)
14365 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14366 simd_immediate_info::LSL, shift);
14367 return true;
14368 }
14369
14370 /* Try a 2-byte immediate with LSL. */
14371 unsigned int imm16 = val32 & 0xffff;
14372 if (imm16 == (val32 >> 16))
14373 for (unsigned int shift = 0; shift < 16; shift += 8)
14374 if ((imm16 & (0xff << shift)) == imm16)
14375 {
14376 if (info)
14377 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14378 simd_immediate_info::LSL, shift);
14379 return true;
14380 }
14381
14382 /* Try a 4-byte immediate with MSL, except for cases that MVN
14383 can handle. */
14384 if (which == AARCH64_CHECK_MOV)
14385 for (unsigned int shift = 8; shift < 24; shift += 8)
14386 {
14387 unsigned int low = (1 << shift) - 1;
14388 if (((val32 & (0xff << shift)) | low) == val32)
14389 {
14390 if (info)
14391 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14392 simd_immediate_info::MSL, shift);
14393 return true;
14394 }
14395 }
14396
14397 return false;
14398 }
14399
14400 /* Return true if replicating VAL64 is a valid immediate for the
14401 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14402 use it to describe valid immediates. */
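/* For example (illustrative), 0x00ff00ffff0000ff is accepted by the final
   check below because every byte is either 0x00 or 0xff, so it can be
   built as a single 64-bit byte-mask immediate.  */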
14403 static bool
14404 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14405 simd_immediate_info *info,
14406 enum simd_immediate_check which)
14407 {
14408 unsigned int val32 = val64 & 0xffffffff;
14409 unsigned int val16 = val64 & 0xffff;
14410 unsigned int val8 = val64 & 0xff;
14411
14412 if (val32 == (val64 >> 32))
14413 {
14414 if ((which & AARCH64_CHECK_ORR) != 0
14415 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14416 simd_immediate_info::MOV))
14417 return true;
14418
14419 if ((which & AARCH64_CHECK_BIC) != 0
14420 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14421 simd_immediate_info::MVN))
14422 return true;
14423
14424 /* Try using a replicated byte. */
14425 if (which == AARCH64_CHECK_MOV
14426 && val16 == (val32 >> 16)
14427 && val8 == (val16 >> 8))
14428 {
14429 if (info)
14430 *info = simd_immediate_info (QImode, val8);
14431 return true;
14432 }
14433 }
14434
14435 /* Try using a bit-to-bytemask. */
14436 if (which == AARCH64_CHECK_MOV)
14437 {
14438 unsigned int i;
14439 for (i = 0; i < 64; i += 8)
14440 {
14441 unsigned char byte = (val64 >> i) & 0xff;
14442 if (byte != 0 && byte != 0xff)
14443 break;
14444 }
14445 if (i == 64)
14446 {
14447 if (info)
14448 *info = simd_immediate_info (DImode, val64);
14449 return true;
14450 }
14451 }
14452 return false;
14453 }
14454
14455 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14456 instruction. If INFO is nonnull, use it to describe valid immediates. */
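/* For example (illustrative): 0x2a2a2a2a2a2a2a2a collapses to the QImode
   value 42 and uses DUP with no shift; 0x1200120012001200 collapses to the
   HImode value 0x1200 and uses DUP with LSL #8; anything else falls back
   to the DUPM (bitmask) test.  */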
14457
14458 static bool
14459 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14460 simd_immediate_info *info)
14461 {
14462 scalar_int_mode mode = DImode;
14463 unsigned int val32 = val64 & 0xffffffff;
14464 if (val32 == (val64 >> 32))
14465 {
14466 mode = SImode;
14467 unsigned int val16 = val32 & 0xffff;
14468 if (val16 == (val32 >> 16))
14469 {
14470 mode = HImode;
14471 unsigned int val8 = val16 & 0xff;
14472 if (val8 == (val16 >> 8))
14473 mode = QImode;
14474 }
14475 }
14476 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14477 if (IN_RANGE (val, -0x80, 0x7f))
14478 {
14479 /* DUP with no shift. */
14480 if (info)
14481 *info = simd_immediate_info (mode, val);
14482 return true;
14483 }
14484 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14485 {
14486 /* DUP with LSL #8. */
14487 if (info)
14488 *info = simd_immediate_info (mode, val);
14489 return true;
14490 }
14491 if (aarch64_bitmask_imm (val64, mode))
14492 {
14493 /* DUPM. */
14494 if (info)
14495 *info = simd_immediate_info (mode, val);
14496 return true;
14497 }
14498 return false;
14499 }
14500
14501 /* Return true if OP is a valid SIMD immediate for the operation
14502 described by WHICH. If INFO is nonnull, use it to describe valid
14503 immediates. */
14504 bool
14505 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14506 enum simd_immediate_check which)
14507 {
14508 machine_mode mode = GET_MODE (op);
14509 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14510 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14511 return false;
14512
14513 scalar_mode elt_mode = GET_MODE_INNER (mode);
14514 rtx base, step;
14515 unsigned int n_elts;
14516 if (GET_CODE (op) == CONST_VECTOR
14517 && CONST_VECTOR_DUPLICATE_P (op))
14518 n_elts = CONST_VECTOR_NPATTERNS (op);
14519 else if ((vec_flags & VEC_SVE_DATA)
14520 && const_vec_series_p (op, &base, &step))
14521 {
14522 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14523 if (!aarch64_sve_index_immediate_p (base)
14524 || !aarch64_sve_index_immediate_p (step))
14525 return false;
14526
14527 if (info)
14528 *info = simd_immediate_info (elt_mode, base, step);
14529 return true;
14530 }
14531 else if (GET_CODE (op) == CONST_VECTOR
14532 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14533 /* N_ELTS set above. */;
14534 else
14535 return false;
14536
14537 /* Handle PFALSE and PTRUE. */
14538 if (vec_flags & VEC_SVE_PRED)
14539 return (op == CONST0_RTX (mode)
14540 || op == CONSTM1_RTX (mode));
14541
14542 scalar_float_mode elt_float_mode;
14543 if (n_elts == 1
14544 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14545 {
14546 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14547 if (aarch64_float_const_zero_rtx_p (elt)
14548 || aarch64_float_const_representable_p (elt))
14549 {
14550 if (info)
14551 *info = simd_immediate_info (elt_float_mode, elt);
14552 return true;
14553 }
14554 }
14555
14556 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14557 if (elt_size > 8)
14558 return false;
14559
14560 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14561
14562 /* Expand the vector constant out into a byte vector, with the least
14563 significant byte of the register first. */
14564 auto_vec<unsigned char, 16> bytes;
14565 bytes.reserve (n_elts * elt_size);
14566 for (unsigned int i = 0; i < n_elts; i++)
14567 {
14568 /* The vector is provided in gcc endian-neutral fashion.
14569 For aarch64_be Advanced SIMD, it must be laid out in the vector
14570 register in reverse order. */
14571 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14572 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14573
14574 if (elt_mode != elt_int_mode)
14575 elt = gen_lowpart (elt_int_mode, elt);
14576
14577 if (!CONST_INT_P (elt))
14578 return false;
14579
14580 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14581 for (unsigned int byte = 0; byte < elt_size; byte++)
14582 {
14583 bytes.quick_push (elt_val & 0xff);
14584 elt_val >>= BITS_PER_UNIT;
14585 }
14586 }
14587
14588 /* The immediate must repeat every eight bytes. */
14589 unsigned int nbytes = bytes.length ();
14590 for (unsigned i = 8; i < nbytes; ++i)
14591 if (bytes[i] != bytes[i - 8])
14592 return false;
14593
14594 /* Get the repeating 8-byte value as an integer. No endian correction
14595 is needed here because bytes is already in lsb-first order. */
14596 unsigned HOST_WIDE_INT val64 = 0;
14597 for (unsigned int i = 0; i < 8; i++)
14598 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14599 << (i * BITS_PER_UNIT));
14600
14601 if (vec_flags & VEC_SVE_DATA)
14602 return aarch64_sve_valid_immediate (val64, info);
14603 else
14604 return aarch64_advsimd_valid_immediate (val64, info, which);
14605 }
14606
14607 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14608 has a step within the range of an SVE INDEX immediate. Return the index expression if so,
14609 otherwise return null. */
14610 rtx
14611 aarch64_check_zero_based_sve_index_immediate (rtx x)
14612 {
14613 rtx base, step;
14614 if (const_vec_series_p (x, &base, &step)
14615 && base == const0_rtx
14616 && aarch64_sve_index_immediate_p (step))
14617 return step;
14618 return NULL_RTX;
14619 }
14620
14621 /* Check whether immediate shift constants are within range. */
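/* For example, with 32-bit elements a left-shift count must lie in
   [0, 31] and a right-shift count in [1, 32].  */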
14622 bool
14623 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14624 {
14625 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14626 if (left)
14627 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14628 else
14629 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14630 }
14631
14632 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14633 operation of width WIDTH at bit position POS. */
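/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */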
14634
14635 rtx
14636 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14637 {
14638 gcc_assert (CONST_INT_P (width));
14639 gcc_assert (CONST_INT_P (pos));
14640
14641 unsigned HOST_WIDE_INT mask
14642 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14643 return GEN_INT (mask << UINTVAL (pos));
14644 }
14645
14646 bool
14647 aarch64_mov_operand_p (rtx x, machine_mode mode)
14648 {
14649 if (GET_CODE (x) == HIGH
14650 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14651 return true;
14652
14653 if (CONST_INT_P (x))
14654 return true;
14655
14656 if (VECTOR_MODE_P (GET_MODE (x)))
14657 return aarch64_simd_valid_immediate (x, NULL);
14658
14659 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14660 return true;
14661
14662 if (aarch64_sve_cnt_immediate_p (x))
14663 return true;
14664
14665 return aarch64_classify_symbolic_expression (x)
14666 == SYMBOL_TINY_ABSOLUTE;
14667 }
14668
14669 /* Return a const_int vector of VAL. */
14670 rtx
14671 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14672 {
14673 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14674 return gen_const_vec_duplicate (mode, c);
14675 }
14676
14677 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14678
14679 bool
14680 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14681 {
14682 machine_mode vmode;
14683
14684 vmode = aarch64_simd_container_mode (mode, 64);
14685 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14686 return aarch64_simd_valid_immediate (op_v, NULL);
14687 }
14688
14689 /* Construct and return a PARALLEL RTX vector with elements numbering the
14690 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14691 the vector - from the perspective of the architecture. This does not
14692 line up with GCC's perspective on lane numbers, so we end up with
14693 different masks depending on our target endian-ness. The diagram
14694 below may help. We must draw the distinction when building masks
14695 which select one half of the vector. An instruction selecting
14696 architectural low-lanes for a big-endian target, must be described using
14697 a mask selecting GCC high-lanes.
14698
14699 Big-Endian Little-Endian
14700
14701 GCC 0 1 2 3 3 2 1 0
14702 | x | x | x | x | | x | x | x | x |
14703 Architecture 3 2 1 0 3 2 1 0
14704
14705 Low Mask: { 2, 3 } { 0, 1 }
14706 High Mask: { 0, 1 } { 2, 3 }
14707
14708 MODE is the mode of the vector and NUNITS is the number of units in it. */
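/* For example (illustrative), with MODE == V4SImode and HIGH == false this
   selects lanes { 0, 1 } on little-endian and { 2, 3 } on big-endian,
   matching the "Low Mask" row of the diagram above.  */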
14709
14710 rtx
14711 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14712 {
14713 rtvec v = rtvec_alloc (nunits / 2);
14714 int high_base = nunits / 2;
14715 int low_base = 0;
14716 int base;
14717 rtx t1;
14718 int i;
14719
14720 if (BYTES_BIG_ENDIAN)
14721 base = high ? low_base : high_base;
14722 else
14723 base = high ? high_base : low_base;
14724
14725 for (i = 0; i < nunits / 2; i++)
14726 RTVEC_ELT (v, i) = GEN_INT (base + i);
14727
14728 t1 = gen_rtx_PARALLEL (mode, v);
14729 return t1;
14730 }
14731
14732 /* Check OP for validity as a PARALLEL RTX vector with elements
14733 numbering the lanes of either the high (HIGH == TRUE) or low half,
14734 from the perspective of the architecture. See the diagram above
14735 aarch64_simd_vect_par_cnst_half for more details. */
14736
14737 bool
14738 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14739 bool high)
14740 {
14741 int nelts;
14742 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14743 return false;
14744
14745 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14746 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14747 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14748 int i = 0;
14749
14750 if (count_op != count_ideal)
14751 return false;
14752
14753 for (i = 0; i < count_ideal; i++)
14754 {
14755 rtx elt_op = XVECEXP (op, 0, i);
14756 rtx elt_ideal = XVECEXP (ideal, 0, i);
14757
14758 if (!CONST_INT_P (elt_op)
14759 || INTVAL (elt_ideal) != INTVAL (elt_op))
14760 return false;
14761 }
14762 return true;
14763 }
14764
14765 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14766 HIGH (exclusive). */
14767 void
14768 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14769 const_tree exp)
14770 {
14771 HOST_WIDE_INT lane;
14772 gcc_assert (CONST_INT_P (operand));
14773 lane = INTVAL (operand);
14774
14775 if (lane < low || lane >= high)
14776 {
14777 if (exp)
14778 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14779 else
14780 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14781 }
14782 }
14783
14784 /* Perform endian correction on lane number N, which indexes a vector
14785 of mode MODE, and return the result as an SImode rtx. */
14786
14787 rtx
14788 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14789 {
14790 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14791 }
14792
14793 /* Return TRUE if OP is a valid vector addressing mode. */
14794
14795 bool
14796 aarch64_simd_mem_operand_p (rtx op)
14797 {
14798 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14799 || REG_P (XEXP (op, 0)));
14800 }
14801
14802 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14803
14804 bool
14805 aarch64_sve_ld1r_operand_p (rtx op)
14806 {
14807 struct aarch64_address_info addr;
14808 scalar_mode mode;
14809
14810 return (MEM_P (op)
14811 && is_a <scalar_mode> (GET_MODE (op), &mode)
14812 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14813 && addr.type == ADDRESS_REG_IMM
14814 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14815 }
14816
14817 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14818 The conditions for STR are the same. */
14819 bool
14820 aarch64_sve_ldr_operand_p (rtx op)
14821 {
14822 struct aarch64_address_info addr;
14823
14824 return (MEM_P (op)
14825 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14826 false, ADDR_QUERY_ANY)
14827 && addr.type == ADDRESS_REG_IMM);
14828 }
14829
14830 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14831 We need to be able to access the individual pieces, so the range
14832 is different from LD[234] and ST[234]. */
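/* A rough sketch of the constraint, assuming offset_4bit_signed_scaled_p
   accepts signed 4-bit multiples of the vector length: both the first and
   the last vector of the structure must lie within [-8, 7] vector lengths
   of the base address.  */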
14833 bool
14834 aarch64_sve_struct_memory_operand_p (rtx op)
14835 {
14836 if (!MEM_P (op))
14837 return false;
14838
14839 machine_mode mode = GET_MODE (op);
14840 struct aarch64_address_info addr;
14841 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14842 ADDR_QUERY_ANY)
14843 || addr.type != ADDRESS_REG_IMM)
14844 return false;
14845
14846 poly_int64 first = addr.const_offset;
14847 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14848 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14849 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14850 }
14851
14852 /* Emit a register copy from operand to operand, taking care not to
14853 early-clobber source registers in the process.
14854
14855 COUNT is the number of components into which the copy needs to be
14856 decomposed. */
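/* For example (illustrative), copying a two-register value from V1-V2 to
   V2-V3 overlaps, so the copy below is emitted highest register first
   (V3 from V2, then V2 from V1) to avoid clobbering a source register
   before it has been read.  */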
14857 void
14858 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14859 unsigned int count)
14860 {
14861 unsigned int i;
14862 int rdest = REGNO (operands[0]);
14863 int rsrc = REGNO (operands[1]);
14864
14865 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14866 || rdest < rsrc)
14867 for (i = 0; i < count; i++)
14868 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14869 gen_rtx_REG (mode, rsrc + i));
14870 else
14871 for (i = 0; i < count; i++)
14872 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14873 gen_rtx_REG (mode, rsrc + count - i - 1));
14874 }
14875
14876 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14877 one of VSTRUCT modes: OI, CI, or XI. */
14878 int
14879 aarch64_simd_attr_length_rglist (machine_mode mode)
14880 {
14881 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14882 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14883 }
14884
14885 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14886 alignment of a vector to 128 bits. SVE predicates have an alignment of
14887 16 bits. */
14888 static HOST_WIDE_INT
14889 aarch64_simd_vector_alignment (const_tree type)
14890 {
14891 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14892 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14893 be set for non-predicate vectors of booleans. Modes are the most
14894 direct way we have of identifying real SVE predicate types. */
14895 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14896 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14897 return MIN (align, 128);
14898 }
14899
14900 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14901 static poly_uint64
14902 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14903 {
14904 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14905 {
14906 /* If the length of the vector is fixed, try to align to that length,
14907 otherwise don't try to align at all. */
14908 HOST_WIDE_INT result;
14909 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14910 result = TYPE_ALIGN (TREE_TYPE (type));
14911 return result;
14912 }
14913 return TYPE_ALIGN (type);
14914 }
14915
14916 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14917 static bool
14918 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14919 {
14920 if (is_packed)
14921 return false;
14922
14923 /* For fixed-length vectors, check that the vectorizer will aim for
14924 full-vector alignment. This isn't true for generic GCC vectors
14925 that are wider than the ABI maximum of 128 bits. */
14926 poly_uint64 preferred_alignment =
14927 aarch64_vectorize_preferred_vector_alignment (type);
14928 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14929 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14930 preferred_alignment))
14931 return false;
14932
14933 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14934 return true;
14935 }
14936
14937 /* Return true if the vector misalignment factor is supported by the
14938 target. */
14939 static bool
14940 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14941 const_tree type, int misalignment,
14942 bool is_packed)
14943 {
14944 if (TARGET_SIMD && STRICT_ALIGNMENT)
14945 {
14946 /* Return if movmisalign pattern is not supported for this mode. */
14947 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14948 return false;
14949
14950 /* Misalignment factor is unknown at compile time. */
14951 if (misalignment == -1)
14952 return false;
14953 }
14954 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14955 is_packed);
14956 }
14957
14958 /* If VALS is a vector constant that can be loaded into a register
14959 using DUP, generate instructions to do so and return an RTX to
14960 assign to the register. Otherwise return NULL_RTX. */
14961 static rtx
14962 aarch64_simd_dup_constant (rtx vals)
14963 {
14964 machine_mode mode = GET_MODE (vals);
14965 machine_mode inner_mode = GET_MODE_INNER (mode);
14966 rtx x;
14967
14968 if (!const_vec_duplicate_p (vals, &x))
14969 return NULL_RTX;
14970
14971 /* We can load this constant by using DUP and a constant in a
14972 single ARM register. This will be cheaper than a vector
14973 load. */
14974 x = copy_to_mode_reg (inner_mode, x);
14975 return gen_vec_duplicate (mode, x);
14976 }
14977
14978
14979 /* Generate code to load VALS, which is a PARALLEL containing only
14980 constants (for vec_init) or CONST_VECTOR, efficiently into a
14981 register. Returns an RTX to copy into the register, or NULL_RTX
14982 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14983 static rtx
14984 aarch64_simd_make_constant (rtx vals)
14985 {
14986 machine_mode mode = GET_MODE (vals);
14987 rtx const_dup;
14988 rtx const_vec = NULL_RTX;
14989 int n_const = 0;
14990 int i;
14991
14992 if (GET_CODE (vals) == CONST_VECTOR)
14993 const_vec = vals;
14994 else if (GET_CODE (vals) == PARALLEL)
14995 {
14996 /* A CONST_VECTOR must contain only CONST_INTs and
14997 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14998 Only store valid constants in a CONST_VECTOR. */
14999 int n_elts = XVECLEN (vals, 0);
15000 for (i = 0; i < n_elts; ++i)
15001 {
15002 rtx x = XVECEXP (vals, 0, i);
15003 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15004 n_const++;
15005 }
15006 if (n_const == n_elts)
15007 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15008 }
15009 else
15010 gcc_unreachable ();
15011
15012 if (const_vec != NULL_RTX
15013 && aarch64_simd_valid_immediate (const_vec, NULL))
15014 /* Load using MOVI/MVNI. */
15015 return const_vec;
15016 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15017 /* Loaded using DUP. */
15018 return const_dup;
15019 else if (const_vec != NULL_RTX)
15020 /* Load from constant pool. We cannot take advantage of single-cycle
15021 LD1 because we need a PC-relative addressing mode. */
15022 return const_vec;
15023 else
15024 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15025 We cannot construct an initializer. */
15026 return NULL_RTX;
15027 }
15028
15029 /* Expand a vector initialisation sequence, such that TARGET is
15030 initialised to contain VALS. */
15031
15032 void
15033 aarch64_expand_vector_init (rtx target, rtx vals)
15034 {
15035 machine_mode mode = GET_MODE (target);
15036 scalar_mode inner_mode = GET_MODE_INNER (mode);
15037 /* The number of vector elements. */
15038 int n_elts = XVECLEN (vals, 0);
15039 /* The number of vector elements which are not constant. */
15040 int n_var = 0;
15041 rtx any_const = NULL_RTX;
15042 /* The first element of vals. */
15043 rtx v0 = XVECEXP (vals, 0, 0);
15044 bool all_same = true;
15045
15046 /* Count the number of variable elements to initialise. */
15047 for (int i = 0; i < n_elts; ++i)
15048 {
15049 rtx x = XVECEXP (vals, 0, i);
15050 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15051 ++n_var;
15052 else
15053 any_const = x;
15054
15055 all_same &= rtx_equal_p (x, v0);
15056 }
15057
15058 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15059 how best to handle this. */
15060 if (n_var == 0)
15061 {
15062 rtx constant = aarch64_simd_make_constant (vals);
15063 if (constant != NULL_RTX)
15064 {
15065 emit_move_insn (target, constant);
15066 return;
15067 }
15068 }
15069
15070 /* Splat a single non-constant element if we can. */
15071 if (all_same)
15072 {
15073 rtx x = copy_to_mode_reg (inner_mode, v0);
15074 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15075 return;
15076 }
15077
15078 enum insn_code icode = optab_handler (vec_set_optab, mode);
15079 gcc_assert (icode != CODE_FOR_nothing);
15080
15081 /* If there are only variable elements, try to optimize
15082 the insertion using dup for the most common element
15083 followed by insertions. */
15084
15085 /* The algorithm will fill matches[*][0] with the earliest matching element,
15086 and matches[X][1] with the count of duplicate elements (if X is the
15087 earliest element which has duplicates). */
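/* For example (illustrative), for VALS { a, b, a, c } the loop below sets
   matches[2][0] = 0 and matches[0][1] = 2, so element 0 is duplicated
   first and only lanes 1 and 3 need separate insertions.  */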
15088
15089 if (n_var == n_elts && n_elts <= 16)
15090 {
15091 int matches[16][2] = {0};
15092 for (int i = 0; i < n_elts; i++)
15093 {
15094 for (int j = 0; j <= i; j++)
15095 {
15096 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15097 {
15098 matches[i][0] = j;
15099 matches[j][1]++;
15100 break;
15101 }
15102 }
15103 }
15104 int maxelement = 0;
15105 int maxv = 0;
15106 for (int i = 0; i < n_elts; i++)
15107 if (matches[i][1] > maxv)
15108 {
15109 maxelement = i;
15110 maxv = matches[i][1];
15111 }
15112
15113 /* Create a duplicate of the most common element, unless all elements
15114 are equally useless to us, in which case just immediately set the
15115 vector register using the first element. */
15116
15117 if (maxv == 1)
15118 {
15119 /* For vectors of two 64-bit elements, we can do even better. */
15120 if (n_elts == 2
15121 && (inner_mode == E_DImode
15122 || inner_mode == E_DFmode))
15123
15124 {
15125 rtx x0 = XVECEXP (vals, 0, 0);
15126 rtx x1 = XVECEXP (vals, 0, 1);
15127 /* Combine can pick up this case, but handling it directly
15128 here leaves clearer RTL.
15129
15130 This is load_pair_lanes<mode>, and also gives us a clean-up
15131 for store_pair_lanes<mode>. */
15132 if (memory_operand (x0, inner_mode)
15133 && memory_operand (x1, inner_mode)
15134 && !STRICT_ALIGNMENT
15135 && rtx_equal_p (XEXP (x1, 0),
15136 plus_constant (Pmode,
15137 XEXP (x0, 0),
15138 GET_MODE_SIZE (inner_mode))))
15139 {
15140 rtx t;
15141 if (inner_mode == DFmode)
15142 t = gen_load_pair_lanesdf (target, x0, x1);
15143 else
15144 t = gen_load_pair_lanesdi (target, x0, x1);
15145 emit_insn (t);
15146 return;
15147 }
15148 }
15149 /* The subreg-move sequence below will move into lane zero of the
15150 vector register. For big-endian we want that position to hold
15151 the last element of VALS. */
15152 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15153 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15154 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15155 }
15156 else
15157 {
15158 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15159 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15160 }
15161
15162 /* Insert the rest. */
15163 for (int i = 0; i < n_elts; i++)
15164 {
15165 rtx x = XVECEXP (vals, 0, i);
15166 if (matches[i][0] == maxelement)
15167 continue;
15168 x = copy_to_mode_reg (inner_mode, x);
15169 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15170 }
15171 return;
15172 }
15173
15174 /* Initialise a vector which is part-variable. We want to first try
15175 to build those lanes which are constant in the most efficient way we
15176 can. */
15177 if (n_var != n_elts)
15178 {
15179 rtx copy = copy_rtx (vals);
15180
15181 /* Load constant part of vector. We really don't care what goes into the
15182 parts we will overwrite, but we're more likely to be able to load the
15183 constant efficiently if it has fewer, larger, repeating parts
15184 (see aarch64_simd_valid_immediate). */
15185 for (int i = 0; i < n_elts; i++)
15186 {
15187 rtx x = XVECEXP (vals, 0, i);
15188 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15189 continue;
15190 rtx subst = any_const;
15191 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15192 {
15193 /* Look in the copied vector, as more elements are const. */
15194 rtx test = XVECEXP (copy, 0, i ^ bit);
15195 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15196 {
15197 subst = test;
15198 break;
15199 }
15200 }
15201 XVECEXP (copy, 0, i) = subst;
15202 }
15203 aarch64_expand_vector_init (target, copy);
15204 }
15205
15206 /* Insert the variable lanes directly. */
15207 for (int i = 0; i < n_elts; i++)
15208 {
15209 rtx x = XVECEXP (vals, 0, i);
15210 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15211 continue;
15212 x = copy_to_mode_reg (inner_mode, x);
15213 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15214 }
15215 }
15216
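/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */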
15217 static unsigned HOST_WIDE_INT
15218 aarch64_shift_truncation_mask (machine_mode mode)
15219 {
15220 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15221 return 0;
15222 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15223 }
15224
15225 /* Select a format to encode pointers in exception handling data. */
15226 int
15227 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15228 {
15229 int type;
15230 switch (aarch64_cmodel)
15231 {
15232 case AARCH64_CMODEL_TINY:
15233 case AARCH64_CMODEL_TINY_PIC:
15234 case AARCH64_CMODEL_SMALL:
15235 case AARCH64_CMODEL_SMALL_PIC:
15236 case AARCH64_CMODEL_SMALL_SPIC:
15237 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15238 for everything. */
15239 type = DW_EH_PE_sdata4;
15240 break;
15241 default:
15242 /* No assumptions here. 8-byte relocs required. */
15243 type = DW_EH_PE_sdata8;
15244 break;
15245 }
15246 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15247 }
15248
15249 /* The last .arch and .tune assembly strings that we printed. */
15250 static std::string aarch64_last_printed_arch_string;
15251 static std::string aarch64_last_printed_tune_string;
15252
15253 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15254 by the function fndecl. */
15255
15256 void
15257 aarch64_declare_function_name (FILE *stream, const char* name,
15258 tree fndecl)
15259 {
15260 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15261
15262 struct cl_target_option *targ_options;
15263 if (target_parts)
15264 targ_options = TREE_TARGET_OPTION (target_parts);
15265 else
15266 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15267 gcc_assert (targ_options);
15268
15269 const struct processor *this_arch
15270 = aarch64_get_arch (targ_options->x_explicit_arch);
15271
15272 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15273 std::string extension
15274 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15275 this_arch->flags);
15276 /* Only update the assembler .arch string if it is distinct from the last
15277 such string we printed. */
15278 std::string to_print = this_arch->name + extension;
15279 if (to_print != aarch64_last_printed_arch_string)
15280 {
15281 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15282 aarch64_last_printed_arch_string = to_print;
15283 }
15284
15285 /* Print the cpu name we're tuning for in the comments, might be
15286 useful to readers of the generated asm. Do it only when it changes
15287 from function to function and verbose assembly is requested. */
15288 const struct processor *this_tune
15289 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15290
15291 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15292 {
15293 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15294 this_tune->name);
15295 aarch64_last_printed_tune_string = this_tune->name;
15296 }
15297
15298 /* Don't forget the type directive for ELF. */
15299 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15300 ASM_OUTPUT_LABEL (stream, name);
15301 }
15302
15303 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15304
15305 static void
15306 aarch64_start_file (void)
15307 {
15308 struct cl_target_option *default_options
15309 = TREE_TARGET_OPTION (target_option_default_node);
15310
15311 const struct processor *default_arch
15312 = aarch64_get_arch (default_options->x_explicit_arch);
15313 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15314 std::string extension
15315 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15316 default_arch->flags);
15317
15318 aarch64_last_printed_arch_string = default_arch->name + extension;
15319 aarch64_last_printed_tune_string = "";
15320 asm_fprintf (asm_out_file, "\t.arch %s\n",
15321 aarch64_last_printed_arch_string.c_str ());
15322
15323 default_file_start ();
15324 }
15325
15326 /* Emit load exclusive. */
15327
15328 static void
15329 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15330 rtx mem, rtx model_rtx)
15331 {
15332 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15333 }
15334
15335 /* Emit store exclusive. */
15336
15337 static void
15338 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15339 rtx rval, rtx mem, rtx model_rtx)
15340 {
15341 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15342 }
15343
15344 /* Mark the previous jump instruction as unlikely. */
15345
15346 static void
15347 aarch64_emit_unlikely_jump (rtx insn)
15348 {
15349 rtx_insn *jump = emit_jump_insn (insn);
15350 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15351 }
15352
15353 /* Expand a compare and swap pattern. */
15354
15355 void
15356 aarch64_expand_compare_and_swap (rtx operands[])
15357 {
15358 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15359 machine_mode mode, r_mode;
15360
15361 bval = operands[0];
15362 rval = operands[1];
15363 mem = operands[2];
15364 oldval = operands[3];
15365 newval = operands[4];
15366 is_weak = operands[5];
15367 mod_s = operands[6];
15368 mod_f = operands[7];
15369 mode = GET_MODE (mem);
15370
15371 /* Normally the succ memory model must be stronger than fail, but in the
15372 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15373 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15374 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15375 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15376 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15377
15378 r_mode = mode;
15379 if (mode == QImode || mode == HImode)
15380 {
15381 r_mode = SImode;
15382 rval = gen_reg_rtx (r_mode);
15383 }
15384
15385 if (TARGET_LSE)
15386 {
15387 /* The CAS insn requires oldval and rval overlap, but we need to
15388 have a copy of oldval saved across the operation to tell if
15389 the operation is successful. */
15390 if (reg_overlap_mentioned_p (rval, oldval))
15391 rval = copy_to_mode_reg (r_mode, oldval);
15392 else
15393 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15394
15395 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15396 newval, mod_s));
15397 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15398 }
15399 else
15400 {
15401 /* The oldval predicate varies by mode. Test it and force to reg. */
15402 insn_code code = code_for_aarch64_compare_and_swap (mode);
15403 if (!insn_data[code].operand[2].predicate (oldval, mode))
15404 oldval = force_reg (mode, oldval);
15405
15406 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15407 is_weak, mod_s, mod_f));
15408 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15409 }
15410
15411 if (r_mode != mode)
15412 rval = gen_lowpart (mode, rval);
15413 emit_move_insn (operands[1], rval);
15414
15415 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15416 emit_insn (gen_rtx_SET (bval, x));
15417 }
15418
15419 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15420 sequence implementing an atomic operation. */
15421
15422 static void
15423 aarch64_emit_post_barrier (enum memmodel model)
15424 {
15425 const enum memmodel base_model = memmodel_base (model);
15426
15427 if (is_mm_sync (model)
15428 && (base_model == MEMMODEL_ACQUIRE
15429 || base_model == MEMMODEL_ACQ_REL
15430 || base_model == MEMMODEL_SEQ_CST))
15431 {
15432 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15433 }
15434 }
15435
15436 /* Split a compare and swap pattern. */
15437
15438 void
15439 aarch64_split_compare_and_swap (rtx operands[])
15440 {
15441 rtx rval, mem, oldval, newval, scratch;
15442 machine_mode mode;
15443 bool is_weak;
15444 rtx_code_label *label1, *label2;
15445 rtx x, cond;
15446 enum memmodel model;
15447 rtx model_rtx;
15448
15449 rval = operands[0];
15450 mem = operands[1];
15451 oldval = operands[2];
15452 newval = operands[3];
15453 is_weak = (operands[4] != const0_rtx);
15454 model_rtx = operands[5];
15455 scratch = operands[7];
15456 mode = GET_MODE (mem);
15457 model = memmodel_from_int (INTVAL (model_rtx));
15458
15459 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15460 loop:
15461 .label1:
15462 LD[A]XR rval, [mem]
15463 CBNZ rval, .label2
15464 ST[L]XR scratch, newval, [mem]
15465 CBNZ scratch, .label1
15466 .label2:
15467 CMP rval, 0. */
15468 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15469
15470 label1 = NULL;
15471 if (!is_weak)
15472 {
15473 label1 = gen_label_rtx ();
15474 emit_label (label1);
15475 }
15476 label2 = gen_label_rtx ();
15477
15478 /* The initial load can be relaxed for a __sync operation since a final
15479 barrier will be emitted to stop code hoisting. */
15480 if (is_mm_sync (model))
15481 aarch64_emit_load_exclusive (mode, rval, mem,
15482 GEN_INT (MEMMODEL_RELAXED));
15483 else
15484 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15485
15486 if (strong_zero_p)
15487 {
15488 if (aarch64_track_speculation)
15489 {
15490 /* Emit an explicit compare instruction, so that we can correctly
15491 track the condition codes. */
15492 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15493 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15494 }
15495 else
15496 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15497
15498 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15499 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15500 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15501 }
15502 else
15503 {
15504 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15505 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15506 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15507 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15508 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15509 }
15510
15511 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15512
15513 if (!is_weak)
15514 {
15515 if (aarch64_track_speculation)
15516 {
15517 /* Emit an explicit compare instruction, so that we can correctly
15518 track the condition codes. */
15519 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15520 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15521 }
15522 else
15523 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15524
15525 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15526 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15527 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15528 }
15529 else
15530 {
15531 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15532 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15533 emit_insn (gen_rtx_SET (cond, x));
15534 }
15535
15536 emit_label (label2);
15537 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15538 to set the condition flags. If this is not used it will be removed by
15539 later passes. */
15540 if (strong_zero_p)
15541 {
15542 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15543 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15544 emit_insn (gen_rtx_SET (cond, x));
15545 }
15546 /* Emit any final barrier needed for a __sync operation. */
15547 if (is_mm_sync (model))
15548 aarch64_emit_post_barrier (model);
15549 }
15550
15551 /* Split an atomic operation. */
15552
15553 void
15554 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15555 rtx value, rtx model_rtx, rtx cond)
15556 {
15557 machine_mode mode = GET_MODE (mem);
15558 machine_mode wmode = (mode == DImode ? DImode : SImode);
15559 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15560 const bool is_sync = is_mm_sync (model);
15561 rtx_code_label *label;
15562 rtx x;
15563
15564 /* Split the atomic operation into a sequence. */
15565 label = gen_label_rtx ();
15566 emit_label (label);
15567
15568 if (new_out)
15569 new_out = gen_lowpart (wmode, new_out);
15570 if (old_out)
15571 old_out = gen_lowpart (wmode, old_out);
15572 else
15573 old_out = new_out;
15574 value = simplify_gen_subreg (wmode, value, mode, 0);
15575
15576 /* The initial load can be relaxed for a __sync operation since a final
15577 barrier will be emitted to stop code hoisting. */
15578 if (is_sync)
15579 aarch64_emit_load_exclusive (mode, old_out, mem,
15580 GEN_INT (MEMMODEL_RELAXED));
15581 else
15582 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15583
15584 switch (code)
15585 {
15586 case SET:
15587 new_out = value;
15588 break;
15589
15590 case NOT:
15591 x = gen_rtx_AND (wmode, old_out, value);
15592 emit_insn (gen_rtx_SET (new_out, x));
15593 x = gen_rtx_NOT (wmode, new_out);
15594 emit_insn (gen_rtx_SET (new_out, x));
15595 break;
15596
15597 case MINUS:
15598 if (CONST_INT_P (value))
15599 {
15600 value = GEN_INT (-INTVAL (value));
15601 code = PLUS;
15602 }
15603 /* Fall through. */
15604
15605 default:
15606 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15607 emit_insn (gen_rtx_SET (new_out, x));
15608 break;
15609 }
15610
15611 aarch64_emit_store_exclusive (mode, cond, mem,
15612 gen_lowpart (mode, new_out), model_rtx);
15613
15614 if (aarch64_track_speculation)
15615 {
15616 /* Emit an explicit compare instruction, so that we can correctly
15617 track the condition codes. */
15618 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15619 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15620 }
15621 else
15622 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15623
15624 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15625 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15626 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15627
15628 /* Emit any final barrier needed for a __sync operation. */
15629 if (is_sync)
15630 aarch64_emit_post_barrier (model);
15631 }
15632
15633 static void
15634 aarch64_init_libfuncs (void)
15635 {
15636 /* Half-precision float operations. The compiler handles all operations
15637 with NULL libfuncs by converting to SFmode. */
15638
15639 /* Conversions. */
15640 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15641 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15642
15643 /* Arithmetic. */
15644 set_optab_libfunc (add_optab, HFmode, NULL);
15645 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15646 set_optab_libfunc (smul_optab, HFmode, NULL);
15647 set_optab_libfunc (neg_optab, HFmode, NULL);
15648 set_optab_libfunc (sub_optab, HFmode, NULL);
15649
15650 /* Comparisons. */
15651 set_optab_libfunc (eq_optab, HFmode, NULL);
15652 set_optab_libfunc (ne_optab, HFmode, NULL);
15653 set_optab_libfunc (lt_optab, HFmode, NULL);
15654 set_optab_libfunc (le_optab, HFmode, NULL);
15655 set_optab_libfunc (ge_optab, HFmode, NULL);
15656 set_optab_libfunc (gt_optab, HFmode, NULL);
15657 set_optab_libfunc (unord_optab, HFmode, NULL);
15658 }
15659
15660 /* Target hook for c_mode_for_suffix. */
15661 static machine_mode
15662 aarch64_c_mode_for_suffix (char suffix)
15663 {
15664 if (suffix == 'q')
15665 return TFmode;
15666
15667 return VOIDmode;
15668 }
15669
15670 /* We can only represent floating point constants which will fit in
15671 "quarter-precision" values. These values are characterised by
15672 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
15673 by:
15674
15675 (-1)^s * (n/16) * 2^r
15676
15677 Where:
15678 's' is the sign bit.
15679 'n' is an integer in the range 16 <= n <= 31.
15680 'r' is an integer in the range -3 <= r <= 4. */
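/* For example (illustrative): 1.0 is n = 16, r = 0 and 0.5 is n = 16,
   r = -1; the smallest representable magnitude is 16/16 * 2^-3 = 0.125
   and the largest is 31/16 * 2^4 = 31.0.  */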
15681
15682 /* Return true iff X can be represented by a quarter-precision
15683 floating point immediate operand. Note that we cannot represent 0.0. */
15684 bool
15685 aarch64_float_const_representable_p (rtx x)
15686 {
15687 /* This represents our current view of how many bits
15688 make up the mantissa. */
15689 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15690 int exponent;
15691 unsigned HOST_WIDE_INT mantissa, mask;
15692 REAL_VALUE_TYPE r, m;
15693 bool fail;
15694
15695 if (!CONST_DOUBLE_P (x))
15696 return false;
15697
15698 if (GET_MODE (x) == VOIDmode
15699 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15700 return false;
15701
15702 r = *CONST_DOUBLE_REAL_VALUE (x);
15703
15704 /* We cannot represent infinities, NaNs or +/-zero. We won't
15705 know if we have +zero until we analyse the mantissa, but we
15706 can reject the other invalid values. */
15707 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15708 || REAL_VALUE_MINUS_ZERO (r))
15709 return false;
15710
15711 /* Extract exponent. */
15712 r = real_value_abs (&r);
15713 exponent = REAL_EXP (&r);
15714
15715 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15716 highest (sign) bit, with a fixed binary point at bit point_pos.
15717 The low and high parts are accessed below via w.ulow () and w.elt (1).
15718 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15719 bits for the mantissa, this can fail (low bits will be lost). */
15720 real_ldexp (&m, &r, point_pos - exponent);
15721 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15722
15723 /* If the low part of the mantissa has bits set we cannot represent
15724 the value. */
15725 if (w.ulow () != 0)
15726 return false;
15727 /* We have rejected the lower HOST_WIDE_INT, so update our
15728 understanding of how many bits lie in the mantissa and
15729 look only at the high HOST_WIDE_INT. */
15730 mantissa = w.elt (1);
15731 point_pos -= HOST_BITS_PER_WIDE_INT;
15732
15733 /* We can only represent values with a mantissa of the form 1.xxxx. */
15734 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15735 if ((mantissa & mask) != 0)
15736 return false;
15737
15738 /* Having filtered unrepresentable values, we may now remove all
15739 but the highest 5 bits. */
15740 mantissa >>= point_pos - 5;
15741
15742 /* We cannot represent the value 0.0, so reject it. This is handled
15743 elsewhere. */
15744 if (mantissa == 0)
15745 return false;
15746
15747 /* Then, as bit 4 is always set, we can mask it off, leaving
15748 the mantissa in the range [0, 15]. */
15749 mantissa &= ~(1 << 4);
15750 gcc_assert (mantissa <= 15);
15751
15752 /* GCC internally does not use IEEE754-like encoding (where normalized
15753 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
15754 Our mantissa values are shifted 4 places to the left relative to
15755 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15756 by 5 places to correct for GCC's representation. */
15757 exponent = 5 - exponent;
15758
15759 return (exponent >= 0 && exponent <= 7);
15760 }
15761
15762 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15763 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
15764 output MOVI/MVNI, ORR or BIC immediate. */
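/* For example (illustrative), a V4SI vector of four copies of 0x00ab0000
   checked with AARCH64_CHECK_MOV produces something like
   "movi\t%0.4s, 0xab, lsl 16".  */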
15765 char*
15766 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15767 enum simd_immediate_check which)
15768 {
15769 bool is_valid;
15770 static char templ[40];
15771 const char *mnemonic;
15772 const char *shift_op;
15773 unsigned int lane_count = 0;
15774 char element_char;
15775
15776 struct simd_immediate_info info;
15777
15778 /* This will return true to show const_vector is legal for use as either
15779 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15780 It will also update INFO to show how the immediate should be generated.
15781 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15782 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15783 gcc_assert (is_valid);
15784
15785 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15786 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15787
15788 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15789 {
15790 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15791 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15792 move immediate path. */
15793 if (aarch64_float_const_zero_rtx_p (info.value))
15794 info.value = GEN_INT (0);
15795 else
15796 {
15797 const unsigned int buf_size = 20;
15798 char float_buf[buf_size] = {'\0'};
15799 real_to_decimal_for_mode (float_buf,
15800 CONST_DOUBLE_REAL_VALUE (info.value),
15801 buf_size, buf_size, 1, info.elt_mode);
15802
15803 if (lane_count == 1)
15804 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15805 else
15806 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15807 lane_count, element_char, float_buf);
15808 return templ;
15809 }
15810 }
15811
15812 gcc_assert (CONST_INT_P (info.value));
15813
15814 if (which == AARCH64_CHECK_MOV)
15815 {
15816 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15817 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15818 if (lane_count == 1)
15819 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15820 mnemonic, UINTVAL (info.value));
15821 else if (info.shift)
15822 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15823 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15824 element_char, UINTVAL (info.value), shift_op, info.shift);
15825 else
15826 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15827 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15828 element_char, UINTVAL (info.value));
15829 }
15830 else
15831 {
15832 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15833 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15834 if (info.shift)
15835 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15836 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15837 element_char, UINTVAL (info.value), "lsl", info.shift);
15838 else
15839 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15840 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15841 element_char, UINTVAL (info.value));
15842 }
15843 return templ;
15844 }
15845
15846 char*
15847 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15848 {
15849
15850 /* If a floating point number was passed and we desire to use it in an
15851 integer mode, do the conversion to integer. */
15852 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15853 {
15854 unsigned HOST_WIDE_INT ival;
15855 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15856 gcc_unreachable ();
15857 immediate = gen_int_mode (ival, mode);
15858 }
15859
15860 machine_mode vmode;
15861 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
15862 a 128-bit vector mode. */
15863 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15864
15865 vmode = aarch64_simd_container_mode (mode, width);
15866 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15867 return aarch64_output_simd_mov_immediate (v_op, width);
15868 }
15869
15870 /* Return the output string to use for moving immediate CONST_VECTOR
15871 into an SVE register. */
15872
15873 char *
15874 aarch64_output_sve_mov_immediate (rtx const_vector)
15875 {
15876 static char templ[40];
15877 struct simd_immediate_info info;
15878 char element_char;
15879
15880 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15881 gcc_assert (is_valid);
15882
15883 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15884
15885 if (info.step)
15886 {
15887 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15888 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15889 element_char, INTVAL (info.value), INTVAL (info.step));
15890 return templ;
15891 }
15892
15893 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15894 {
15895 if (aarch64_float_const_zero_rtx_p (info.value))
15896 info.value = GEN_INT (0);
15897 else
15898 {
15899 const int buf_size = 20;
15900 char float_buf[buf_size] = {};
15901 real_to_decimal_for_mode (float_buf,
15902 CONST_DOUBLE_REAL_VALUE (info.value),
15903 buf_size, buf_size, 1, info.elt_mode);
15904
15905 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15906 element_char, float_buf);
15907 return templ;
15908 }
15909 }
15910
15911 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15912 element_char, INTVAL (info.value));
15913 return templ;
15914 }
15915
15916 /* Return the asm format for a PTRUE instruction whose destination has
15917 mode MODE. SUFFIX is the element size suffix. */
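/* For example (illustrative): a fixed-length predicate mode with 16
   elements and SUFFIX 'b' gives "ptrue\t%0.b, vl16", whereas a scalable
   mode gives "ptrue\t%0.b, all".  */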
15918
15919 char *
15920 aarch64_output_ptrue (machine_mode mode, char suffix)
15921 {
15922 unsigned int nunits;
15923 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15924 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15925 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15926 else
15927 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15928 return buf;
15929 }
15930
15931 /* Split operands into moves from op[1] + op[2] into op[0]. */
15932
15933 void
15934 aarch64_split_combinev16qi (rtx operands[3])
15935 {
15936 unsigned int dest = REGNO (operands[0]);
15937 unsigned int src1 = REGNO (operands[1]);
15938 unsigned int src2 = REGNO (operands[2]);
15939 machine_mode halfmode = GET_MODE (operands[1]);
15940 unsigned int halfregs = REG_NREGS (operands[1]);
15941 rtx destlo, desthi;
15942
15943 gcc_assert (halfmode == V16QImode);
15944
15945 if (src1 == dest && src2 == dest + halfregs)
15946 {
15947 /* No-op move. Can't split to nothing; emit something. */
15948 emit_note (NOTE_INSN_DELETED);
15949 return;
15950 }
15951
15952 /* Preserve register attributes for variable tracking. */
15953 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15954 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15955 GET_MODE_SIZE (halfmode));
15956
15957 /* Special case of reversed high/low parts. */
15958 if (reg_overlap_mentioned_p (operands[2], destlo)
15959 && reg_overlap_mentioned_p (operands[1], desthi))
15960 {
15961 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15962 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15963 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15964 }
15965 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15966 {
15967 /* Try to avoid unnecessary moves if part of the result
15968 is in the right place already. */
15969 if (src1 != dest)
15970 emit_move_insn (destlo, operands[1]);
15971 if (src2 != dest + halfregs)
15972 emit_move_insn (desthi, operands[2]);
15973 }
15974 else
15975 {
15976 if (src2 != dest + halfregs)
15977 emit_move_insn (desthi, operands[2]);
15978 if (src1 != dest)
15979 emit_move_insn (destlo, operands[1]);
15980 }
15981 }
15982
15983 /* vec_perm support. */
15984
15985 struct expand_vec_perm_d
15986 {
15987 rtx target, op0, op1;
15988 vec_perm_indices perm;
15989 machine_mode vmode;
15990 unsigned int vec_flags;
15991 bool one_vector_p;
15992 bool testing_p;
15993 };
15994
15995 /* Generate a variable permutation. */
15996
15997 static void
15998 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15999 {
16000 machine_mode vmode = GET_MODE (target);
16001 bool one_vector_p = rtx_equal_p (op0, op1);
16002
16003 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16004 gcc_checking_assert (GET_MODE (op0) == vmode);
16005 gcc_checking_assert (GET_MODE (op1) == vmode);
16006 gcc_checking_assert (GET_MODE (sel) == vmode);
16007 gcc_checking_assert (TARGET_SIMD);
16008
16009 if (one_vector_p)
16010 {
16011 if (vmode == V8QImode)
16012 {
16013 /* Expand the argument to a V16QI mode by duplicating it. */
16014 rtx pair = gen_reg_rtx (V16QImode);
16015 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16016 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16017 }
16018 else
16019 {
16020 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16021 }
16022 }
16023 else
16024 {
16025 rtx pair;
16026
16027 if (vmode == V8QImode)
16028 {
16029 pair = gen_reg_rtx (V16QImode);
16030 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16031 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16032 }
16033 else
16034 {
16035 pair = gen_reg_rtx (OImode);
16036 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16037 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16038 }
16039 }
16040 }
16041
16042 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16043 NELT is the number of elements in the vector. */
16044
16045 void
16046 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16047 unsigned int nelt)
16048 {
16049 machine_mode vmode = GET_MODE (target);
16050 bool one_vector_p = rtx_equal_p (op0, op1);
16051 rtx mask;
16052
16053 /* The TBL instruction does not use a modulo index, so we must take care
16054 of that ourselves. */
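/* For example (illustrative only): for a two-vector V16QImode permute
   NELT is 16, so each selector byte is ANDed with 31 to emulate the
   modulo behaviour that vec_perm expects but TBL does not provide.  */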
16055 mask = aarch64_simd_gen_const_vector_dup (vmode,
16056 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16057 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16058
16059 /* For big-endian, we also need to reverse the index within the vector
16060 (but not which vector). */
16061 if (BYTES_BIG_ENDIAN)
16062 {
16063 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16064 if (!one_vector_p)
16065 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16066 sel = expand_simple_binop (vmode, XOR, sel, mask,
16067 NULL, 0, OPTAB_LIB_WIDEN);
16068 }
16069 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16070 }
16071
16072 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16073
16074 static void
16075 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16076 {
16077 emit_insn (gen_rtx_SET (target,
16078 gen_rtx_UNSPEC (GET_MODE (target),
16079 gen_rtvec (2, op0, op1), code)));
16080 }
16081
16082 /* Expand an SVE vec_perm with the given operands. */
16083
16084 void
16085 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16086 {
16087 machine_mode data_mode = GET_MODE (target);
16088 machine_mode sel_mode = GET_MODE (sel);
16089 /* Enforced by the pattern condition. */
16090 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16091
16092 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16093 size of the two value vectors, i.e. the upper bits of the indices
16094 are effectively ignored. SVE TBL instead produces 0 for any
16095 out-of-range indices, so we need to modulo all the vec_perm indices
16096 to ensure they are all in range. */
16097 rtx sel_reg = force_reg (sel_mode, sel);
16098
16099 /* Check if the sel only references the first values vector. */
16100 if (GET_CODE (sel) == CONST_VECTOR
16101 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16102 {
16103 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16104 return;
16105 }
16106
16107 /* Check if the two values vectors are the same. */
16108 if (rtx_equal_p (op0, op1))
16109 {
16110 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16111 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16112 NULL, 0, OPTAB_DIRECT);
16113 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16114 return;
16115 }
16116
16117 /* Run TBL on each value vector and combine the results. */
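/* Illustrative sketch with 4-element vectors and selector { 1, 6, 2, 5 }:
   TBL (op0, { 1, 6, 2, 5 }) gives { op0[1], 0, op0[2], 0 }, since 6 and 5
   are out of range, and TBL (op1, { 1, 6, 2, 5 } - 4) gives
   { 0, op1[2], 0, op1[1] }; ORing the two gives the required result.  */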
16118
16119 rtx res0 = gen_reg_rtx (data_mode);
16120 rtx res1 = gen_reg_rtx (data_mode);
16121 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16122 if (GET_CODE (sel) != CONST_VECTOR
16123 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16124 {
16125 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16126 2 * nunits - 1);
16127 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16128 NULL, 0, OPTAB_DIRECT);
16129 }
16130 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16131 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16132 NULL, 0, OPTAB_DIRECT);
16133 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16134 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16135 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16136 else
16137 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16138 }
16139
16140 /* Recognize patterns suitable for the TRN instructions. */
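/* For example (little-endian element numbering): with four elements per
   vector, the selector { 0, 4, 2, 6 } matches TRN1 and { 1, 5, 3, 7 }
   matches TRN2.  */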
16141 static bool
16142 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16143 {
16144 HOST_WIDE_INT odd;
16145 poly_uint64 nelt = d->perm.length ();
16146 rtx out, in0, in1, x;
16147 machine_mode vmode = d->vmode;
16148
16149 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16150 return false;
16151
16152 /* Note that these are little-endian tests.
16153 We correct for big-endian later. */
16154 if (!d->perm[0].is_constant (&odd)
16155 || (odd != 0 && odd != 1)
16156 || !d->perm.series_p (0, 2, odd, 2)
16157 || !d->perm.series_p (1, 2, nelt + odd, 2))
16158 return false;
16159
16160 /* Success! */
16161 if (d->testing_p)
16162 return true;
16163
16164 in0 = d->op0;
16165 in1 = d->op1;
16166 /* We don't need a big-endian lane correction for SVE; see the comment
16167 at the head of aarch64-sve.md for details. */
16168 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16169 {
16170 x = in0, in0 = in1, in1 = x;
16171 odd = !odd;
16172 }
16173 out = d->target;
16174
16175 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16176 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16177 return true;
16178 }
16179
16180 /* Recognize patterns suitable for the UZP instructions. */
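/* For example (little-endian element numbering): with four elements per
   vector, the selector { 0, 2, 4, 6 } matches UZP1 and { 1, 3, 5, 7 }
   matches UZP2.  */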
16181 static bool
16182 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16183 {
16184 HOST_WIDE_INT odd;
16185 rtx out, in0, in1, x;
16186 machine_mode vmode = d->vmode;
16187
16188 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16189 return false;
16190
16191 /* Note that these are little-endian tests.
16192 We correct for big-endian later. */
16193 if (!d->perm[0].is_constant (&odd)
16194 || (odd != 0 && odd != 1)
16195 || !d->perm.series_p (0, 1, odd, 2))
16196 return false;
16197
16198 /* Success! */
16199 if (d->testing_p)
16200 return true;
16201
16202 in0 = d->op0;
16203 in1 = d->op1;
16204 /* We don't need a big-endian lane correction for SVE; see the comment
16205 at the head of aarch64-sve.md for details. */
16206 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16207 {
16208 x = in0, in0 = in1, in1 = x;
16209 odd = !odd;
16210 }
16211 out = d->target;
16212
16213 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16214 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16215 return true;
16216 }
16217
16218 /* Recognize patterns suitable for the ZIP instructions. */
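/* For example (little-endian element numbering): with four elements per
   vector, the selector { 0, 4, 1, 5 } matches ZIP1 and { 2, 6, 3, 7 }
   matches ZIP2.  */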
16219 static bool
16220 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16221 {
16222 unsigned int high;
16223 poly_uint64 nelt = d->perm.length ();
16224 rtx out, in0, in1, x;
16225 machine_mode vmode = d->vmode;
16226
16227 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16228 return false;
16229
16230 /* Note that these are little-endian tests.
16231 We correct for big-endian later. */
16232 poly_uint64 first = d->perm[0];
16233 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16234 || !d->perm.series_p (0, 2, first, 1)
16235 || !d->perm.series_p (1, 2, first + nelt, 1))
16236 return false;
16237 high = maybe_ne (first, 0U);
16238
16239 /* Success! */
16240 if (d->testing_p)
16241 return true;
16242
16243 in0 = d->op0;
16244 in1 = d->op1;
16245 /* We don't need a big-endian lane correction for SVE; see the comment
16246 at the head of aarch64-sve.md for details. */
16247 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16248 {
16249 x = in0, in0 = in1, in1 = x;
16250 high = !high;
16251 }
16252 out = d->target;
16253
16254 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16255 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16256 return true;
16257 }
16258
16259 /* Recognize patterns for the EXT insn. */
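/* For example (little-endian element numbering): with four elements per
   vector, the selector { 1, 2, 3, 4 } matches EXT with an element offset
   of 1, taking the top three elements of the first vector followed by
   the first element of the second.  */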
16260
16261 static bool
16262 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16263 {
16264 HOST_WIDE_INT location;
16265 rtx offset;
16266
16267 /* The first element always refers to the first vector.
16268 Check if the extracted indices are increasing by one. */
16269 if (d->vec_flags == VEC_SVE_PRED
16270 || !d->perm[0].is_constant (&location)
16271 || !d->perm.series_p (0, 1, location, 1))
16272 return false;
16273
16274 /* Success! */
16275 if (d->testing_p)
16276 return true;
16277
16278 /* The case where (location == 0) is a no-op for both big- and little-endian,
16279 and is removed by the mid-end at optimization levels -O1 and higher.
16280
16281 We don't need a big-endian lane correction for SVE; see the comment
16282 at the head of aarch64-sve.md for details. */
16283 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16284 {
16285 /* After setup, we want the high elements of the first vector (stored
16286 at the LSB end of the register), and the low elements of the second
16287 vector (stored at the MSB end of the register). So swap. */
16288 std::swap (d->op0, d->op1);
16289 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16290 to_constant () is safe since this is restricted to Advanced SIMD
16291 vectors. */
16292 location = d->perm.length ().to_constant () - location;
16293 }
16294
16295 offset = GEN_INT (location);
16296 emit_set_insn (d->target,
16297 gen_rtx_UNSPEC (d->vmode,
16298 gen_rtvec (3, d->op0, d->op1, offset),
16299 UNSPEC_EXT));
16300 return true;
16301 }
16302
16303 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16304 within each 64-bit, 32-bit or 16-bit granule. */
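/* For example (illustrative only): for V16QImode the selector
   { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 } reverses the
   bytes within each 64-bit granule and so maps to REV64.  */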
16305
16306 static bool
16307 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16308 {
16309 HOST_WIDE_INT diff;
16310 unsigned int i, size, unspec;
16311 machine_mode pred_mode;
16312
16313 if (d->vec_flags == VEC_SVE_PRED
16314 || !d->one_vector_p
16315 || !d->perm[0].is_constant (&diff))
16316 return false;
16317
16318 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16319 if (size == 8)
16320 {
16321 unspec = UNSPEC_REV64;
16322 pred_mode = VNx2BImode;
16323 }
16324 else if (size == 4)
16325 {
16326 unspec = UNSPEC_REV32;
16327 pred_mode = VNx4BImode;
16328 }
16329 else if (size == 2)
16330 {
16331 unspec = UNSPEC_REV16;
16332 pred_mode = VNx8BImode;
16333 }
16334 else
16335 return false;
16336
16337 unsigned int step = diff + 1;
16338 for (i = 0; i < step; ++i)
16339 if (!d->perm.series_p (i, step, diff - i, step))
16340 return false;
16341
16342 /* Success! */
16343 if (d->testing_p)
16344 return true;
16345
16346 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16347 if (d->vec_flags == VEC_SVE_DATA)
16348 {
16349 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16350 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16351 UNSPEC_MERGE_PTRUE);
16352 }
16353 emit_set_insn (d->target, src);
16354 return true;
16355 }
16356
16357 /* Recognize patterns for the REV insn, which reverses elements within
16358 a full vector. */
16359
16360 static bool
16361 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16362 {
16363 poly_uint64 nelt = d->perm.length ();
16364
16365 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16366 return false;
16367
16368 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16369 return false;
16370
16371 /* Success! */
16372 if (d->testing_p)
16373 return true;
16374
16375 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16376 emit_set_insn (d->target, src);
16377 return true;
16378 }
16379
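/* Recognize broadcast permutations, in which every element of the result
   is a copy of a single element of the first input, and implement them
   as a vec_duplicate (DUP) of the selected lane.  */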
16380 static bool
16381 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16382 {
16383 rtx out = d->target;
16384 rtx in0;
16385 HOST_WIDE_INT elt;
16386 machine_mode vmode = d->vmode;
16387 rtx lane;
16388
16389 if (d->vec_flags == VEC_SVE_PRED
16390 || d->perm.encoding ().encoded_nelts () != 1
16391 || !d->perm[0].is_constant (&elt))
16392 return false;
16393
16394 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16395 return false;
16396
16397 /* Success! */
16398 if (d->testing_p)
16399 return true;
16400
16401 /* The generic preparation in aarch64_expand_vec_perm_const_1
16402 swaps the operand order and the permute indices if it finds
16403 d->perm[0] to be in the second operand. Thus, we can always
16404 use d->op0 and need not do any extra arithmetic to get the
16405 correct lane number. */
16406 in0 = d->op0;
16407 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16408
16409 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16410 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16411 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16412 return true;
16413 }
16414
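/* Try to implement D using an Advanced SIMD TBL instruction.  */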
16415 static bool
16416 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16417 {
16418 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16419 machine_mode vmode = d->vmode;
16420
16421 /* Make sure that the indices are constant. */
16422 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16423 for (unsigned int i = 0; i < encoded_nelts; ++i)
16424 if (!d->perm[i].is_constant ())
16425 return false;
16426
16427 if (d->testing_p)
16428 return true;
16429
16430 /* Generic code will try constant permutation twice: once with the
16431 original mode and again with the elements lowered to QImode.
16432 So wait and don't do the selector expansion ourselves. */
16433 if (vmode != V8QImode && vmode != V16QImode)
16434 return false;
16435
16436 /* to_constant is safe since this routine is specific to Advanced SIMD
16437 vectors. */
16438 unsigned int nelt = d->perm.length ().to_constant ();
16439 for (unsigned int i = 0; i < nelt; ++i)
16440 /* If big-endian and two vectors we end up with a weird mixed-endian
16441 mode on NEON. Reverse the index within each word but not the word
16442 itself. to_constant is safe because we checked is_constant above. */
16443 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16444 ? d->perm[i].to_constant () ^ (nelt - 1)
16445 : d->perm[i].to_constant ());
16446
16447 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16448 sel = force_reg (vmode, sel);
16449
16450 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16451 return true;
16452 }
16453
16454 /* Try to implement D using an SVE TBL instruction. */
16455
16456 static bool
16457 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16458 {
16459 unsigned HOST_WIDE_INT nelt;
16460
16461 /* Permuting two variable-length vectors could overflow the
16462 index range. */
16463 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16464 return false;
16465
16466 if (d->testing_p)
16467 return true;
16468
16469 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16470 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16471 if (d->one_vector_p)
16472 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16473 else
16474 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16475 return true;
16476 }
16477
16478 static bool
16479 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16480 {
16481 /* The pattern matching functions above are written to look for a small
16482 number to begin the sequence (0, 1, N/2). If we begin with an index
16483 from the second operand, we can swap the operands. */
16484 poly_int64 nelt = d->perm.length ();
16485 if (known_ge (d->perm[0], nelt))
16486 {
16487 d->perm.rotate_inputs (1);
16488 std::swap (d->op0, d->op1);
16489 }
16490
16491 if ((d->vec_flags == VEC_ADVSIMD
16492 || d->vec_flags == VEC_SVE_DATA
16493 || d->vec_flags == VEC_SVE_PRED)
16494 && known_gt (nelt, 1))
16495 {
16496 if (aarch64_evpc_rev_local (d))
16497 return true;
16498 else if (aarch64_evpc_rev_global (d))
16499 return true;
16500 else if (aarch64_evpc_ext (d))
16501 return true;
16502 else if (aarch64_evpc_dup (d))
16503 return true;
16504 else if (aarch64_evpc_zip (d))
16505 return true;
16506 else if (aarch64_evpc_uzp (d))
16507 return true;
16508 else if (aarch64_evpc_trn (d))
16509 return true;
16510 if (d->vec_flags == VEC_SVE_DATA)
16511 return aarch64_evpc_sve_tbl (d);
16512 else if (d->vec_flags == VEC_ADVSIMD)
16513 return aarch64_evpc_tbl (d);
16514 }
16515 return false;
16516 }
16517
16518 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16519
16520 static bool
16521 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16522 rtx op1, const vec_perm_indices &sel)
16523 {
16524 struct expand_vec_perm_d d;
16525
16526 /* Check whether the mask can be applied to a single vector. */
16527 if (sel.ninputs () == 1
16528 || (op0 && rtx_equal_p (op0, op1)))
16529 d.one_vector_p = true;
16530 else if (sel.all_from_input_p (0))
16531 {
16532 d.one_vector_p = true;
16533 op1 = op0;
16534 }
16535 else if (sel.all_from_input_p (1))
16536 {
16537 d.one_vector_p = true;
16538 op0 = op1;
16539 }
16540 else
16541 d.one_vector_p = false;
16542
16543 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16544 sel.nelts_per_input ());
16545 d.vmode = vmode;
16546 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16547 d.target = target;
16548 d.op0 = op0;
16549 d.op1 = op1;
16550 d.testing_p = !target;
16551
16552 if (!d.testing_p)
16553 return aarch64_expand_vec_perm_const_1 (&d);
16554
16555 rtx_insn *last = get_last_insn ();
16556 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16557 gcc_assert (last == get_last_insn ());
16558
16559 return ret;
16560 }
16561
16562 /* Generate a byte permute mask for a register of mode MODE,
16563 which has NUNITS units. */
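/* For example (illustrative only): for V8HImode (NUNITS == 8, 2-byte
   units) the generated byte indices are { 1, 0, 3, 2, 5, 4, 7, 6,
   9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the bytes are swapped within
   each 16-bit unit.  */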
16564
16565 rtx
16566 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16567 {
16568 /* We have to reverse each vector because we don't have
16569 a permuted load that can reverse-load according to ABI rules. */
16570 rtx mask;
16571 rtvec v = rtvec_alloc (16);
16572 unsigned int i, j;
16573 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16574
16575 gcc_assert (BYTES_BIG_ENDIAN);
16576 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16577
16578 for (i = 0; i < nunits; i++)
16579 for (j = 0; j < usize; j++)
16580 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16581 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16582 return force_reg (V16QImode, mask);
16583 }
16584
16585 /* Return true if X is a valid second operand for the SVE instruction
16586 that implements integer comparison OP_CODE. */
16587
16588 static bool
16589 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16590 {
16591 if (register_operand (x, VOIDmode))
16592 return true;
16593
16594 switch (op_code)
16595 {
16596 case LTU:
16597 case LEU:
16598 case GEU:
16599 case GTU:
16600 return aarch64_sve_cmp_immediate_p (x, false);
16601 case LT:
16602 case LE:
16603 case GE:
16604 case GT:
16605 case NE:
16606 case EQ:
16607 return aarch64_sve_cmp_immediate_p (x, true);
16608 default:
16609 gcc_unreachable ();
16610 }
16611 }
16612
16613 /* Use predicated SVE instructions to implement the equivalent of:
16614
16615 (set TARGET OP)
16616
16617 given that PTRUE is an all-true predicate of the appropriate mode. */
16618
16619 static void
16620 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16621 {
16622 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16623 gen_rtvec (2, ptrue, op),
16624 UNSPEC_MERGE_PTRUE);
16625 rtx_insn *insn = emit_set_insn (target, unspec);
16626 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16627 }
16628
16629 /* Likewise, but also clobber the condition codes. */
16630
16631 static void
16632 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16633 {
16634 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16635 gen_rtvec (2, ptrue, op),
16636 UNSPEC_MERGE_PTRUE);
16637 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16638 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16639 }
16640
16641 /* Return the UNSPEC_COND_* code for comparison CODE. */
16642
16643 static unsigned int
16644 aarch64_unspec_cond_code (rtx_code code)
16645 {
16646 switch (code)
16647 {
16648 case NE:
16649 return UNSPEC_COND_NE;
16650 case EQ:
16651 return UNSPEC_COND_EQ;
16652 case LT:
16653 return UNSPEC_COND_LT;
16654 case GT:
16655 return UNSPEC_COND_GT;
16656 case LE:
16657 return UNSPEC_COND_LE;
16658 case GE:
16659 return UNSPEC_COND_GE;
16660 default:
16661 gcc_unreachable ();
16662 }
16663 }
16664
16665 /* Emit:
16666
16667 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16668
16669 where <X> is the operation associated with comparison CODE. This form
16670 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16671 semantics, such as when PRED might not be all-true and when comparing
16672 inactive lanes could have side effects. */
16673
16674 static void
16675 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16676 rtx pred, rtx op0, rtx op1)
16677 {
16678 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16679 gen_rtvec (3, pred, op0, op1),
16680 aarch64_unspec_cond_code (code));
16681 emit_set_insn (target, unspec);
16682 }
16683
16684 /* Expand an SVE integer comparison using the SVE equivalent of:
16685
16686 (set TARGET (CODE OP0 OP1)). */
16687
16688 void
16689 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16690 {
16691 machine_mode pred_mode = GET_MODE (target);
16692 machine_mode data_mode = GET_MODE (op0);
16693
16694 if (!aarch64_sve_cmp_operand_p (code, op1))
16695 op1 = force_reg (data_mode, op1);
16696
16697 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16698 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16699 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16700 }
16701
16702 /* Emit the SVE equivalent of:
16703
16704 (set TMP1 (CODE1 OP0 OP1))
16705 (set TMP2 (CODE2 OP0 OP1))
16706 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16707
16708 PTRUE is an all-true predicate with the same mode as TARGET. */
16709
16710 static void
16711 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16712 rtx ptrue, rtx op0, rtx op1)
16713 {
16714 machine_mode pred_mode = GET_MODE (ptrue);
16715 rtx tmp1 = gen_reg_rtx (pred_mode);
16716 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16717 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16718 rtx tmp2 = gen_reg_rtx (pred_mode);
16719 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16720 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16721 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16722 }
16723
16724 /* Emit the SVE equivalent of:
16725
16726 (set TMP (CODE OP0 OP1))
16727 (set TARGET (not TMP))
16728
16729 PTRUE is an all-true predicate with the same mode as TARGET. */
16730
16731 static void
16732 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16733 rtx op0, rtx op1)
16734 {
16735 machine_mode pred_mode = GET_MODE (ptrue);
16736 rtx tmp = gen_reg_rtx (pred_mode);
16737 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16738 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16739 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16740 }
16741
16742 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16743
16744 (set TARGET (CODE OP0 OP1))
16745
16746 If CAN_INVERT_P is true, the caller can also handle inverted results;
16747 return true if the result is in fact inverted. */
16748
16749 bool
16750 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16751 rtx op0, rtx op1, bool can_invert_p)
16752 {
16753 machine_mode pred_mode = GET_MODE (target);
16754 machine_mode data_mode = GET_MODE (op0);
16755
16756 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16757 switch (code)
16758 {
16759 case UNORDERED:
16760 /* UNORDERED has no immediate form. */
16761 op1 = force_reg (data_mode, op1);
16762 /* fall through */
16763 case LT:
16764 case LE:
16765 case GT:
16766 case GE:
16767 case EQ:
16768 case NE:
16769 {
16770 /* There is native support for the comparison. */
16771 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16772 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16773 return false;
16774 }
16775
16776 case LTGT:
16777 /* This is a trapping operation (LT or GT). */
16778 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16779 return false;
16780
16781 case UNEQ:
16782 if (!flag_trapping_math)
16783 {
16784 /* This would trap for signaling NaNs. */
16785 op1 = force_reg (data_mode, op1);
16786 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16787 return false;
16788 }
16789 /* fall through */
16790 case UNLT:
16791 case UNLE:
16792 case UNGT:
16793 case UNGE:
16794 if (flag_trapping_math)
16795 {
16796 /* Work out which elements are ordered. */
16797 rtx ordered = gen_reg_rtx (pred_mode);
16798 op1 = force_reg (data_mode, op1);
16799 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16800
16801 /* Test the opposite condition for the ordered elements,
16802 then invert the result. */
16803 if (code == UNEQ)
16804 code = NE;
16805 else
16806 code = reverse_condition_maybe_unordered (code);
16807 if (can_invert_p)
16808 {
16809 aarch64_emit_sve_predicated_cond (target, code,
16810 ordered, op0, op1);
16811 return true;
16812 }
16813 rtx tmp = gen_reg_rtx (pred_mode);
16814 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16815 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16816 return false;
16817 }
16818 break;
16819
16820 case ORDERED:
16821 /* ORDERED has no immediate form. */
16822 op1 = force_reg (data_mode, op1);
16823 break;
16824
16825 default:
16826 gcc_unreachable ();
16827 }
16828
16829 /* There is native support for the inverse comparison. */
16830 code = reverse_condition_maybe_unordered (code);
16831 if (can_invert_p)
16832 {
16833 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16834 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16835 return true;
16836 }
16837 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16838 return false;
16839 }
16840
16841 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16842 of the data being selected and CMP_MODE is the mode of the values being
16843 compared. */
16844
16845 void
16846 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16847 rtx *ops)
16848 {
16849 machine_mode pred_mode
16850 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16851 GET_MODE_SIZE (cmp_mode)).require ();
16852 rtx pred = gen_reg_rtx (pred_mode);
16853 if (FLOAT_MODE_P (cmp_mode))
16854 {
16855 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16856 ops[4], ops[5], true))
16857 std::swap (ops[1], ops[2]);
16858 }
16859 else
16860 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16861
16862 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16863 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16864 }
16865
16866 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16867 true. However due to issues with register allocation it is preferable
16868 to avoid tying integer scalar and FP scalar modes. Executing integer
16869 operations in general registers is better than treating them as scalar
16870 vector operations. This reduces latency and avoids redundant int<->FP
16871 moves. So tie modes if they are either the same class, or vector modes
16872 with other vector modes, vector structs or any scalar mode. */
16873
16874 static bool
16875 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16876 {
16877 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16878 return true;
16879
16880 /* We specifically want to allow elements of "structure" modes to
16881 be tieable to the structure. This more general condition allows
16882 other rarer situations too. The reason we don't extend this to
16883 predicate modes is that there are no predicate structure modes
16884 nor any specific instructions for extracting part of a predicate
16885 register. */
16886 if (aarch64_vector_data_mode_p (mode1)
16887 && aarch64_vector_data_mode_p (mode2))
16888 return true;
16889
16890 /* Also allow any scalar modes with vectors. */
16891 if (aarch64_vector_mode_supported_p (mode1)
16892 || aarch64_vector_mode_supported_p (mode2))
16893 return true;
16894
16895 return false;
16896 }
16897
16898 /* Return a new RTX holding the result of moving POINTER forward by
16899 AMOUNT bytes. */
16900
16901 static rtx
16902 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16903 {
16904 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16905
16906 return adjust_automodify_address (pointer, GET_MODE (pointer),
16907 next, amount);
16908 }
16909
16910 /* Return a new RTX holding the result of moving POINTER forward by the
16911 size of the mode it points to. */
16912
16913 static rtx
16914 aarch64_progress_pointer (rtx pointer)
16915 {
16916 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16917 }
16918
16919 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16920 MODE bytes. */
16921
16922 static void
16923 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16924 machine_mode mode)
16925 {
16926 rtx reg = gen_reg_rtx (mode);
16927
16928 /* "Cast" the pointers to the correct mode. */
16929 *src = adjust_address (*src, mode, 0);
16930 *dst = adjust_address (*dst, mode, 0);
16931 /* Emit the memcpy. */
16932 emit_move_insn (reg, *src);
16933 emit_move_insn (*dst, reg);
16934 /* Move the pointers forward. */
16935 *src = aarch64_progress_pointer (*src);
16936 *dst = aarch64_progress_pointer (*dst);
16937 }
16938
16939 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16940 we succeed, otherwise return false. */
16941
16942 bool
16943 aarch64_expand_movmem (rtx *operands)
16944 {
16945 int n, mode_bits;
16946 rtx dst = operands[0];
16947 rtx src = operands[1];
16948 rtx base;
16949 machine_mode cur_mode = BLKmode, next_mode;
16950 bool speed_p = !optimize_function_for_size_p (cfun);
16951
16952 /* When optimizing for size, give a better estimate of the length of a
16953 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16954 will always require an even number of instructions to do. And each
16955 operation requires both a load and a store, so divide the max number by 2.
16956 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16957
16958 /* We can't do anything smart if the amount to copy is not constant. */
16959 if (!CONST_INT_P (operands[2]))
16960 return false;
16961
16962 n = INTVAL (operands[2]);
16963
16964 /* Try to keep the number of instructions low. For all cases we will do at
16965 most two moves for the residual amount, since we'll always overlap the
16966 remainder. */
16967 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16968 return false;
16969
16970 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16971 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16972
16973 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16974 src = adjust_automodify_address (src, VOIDmode, base, 0);
16975
16976 /* Convert n to bits to make the rest of the code simpler. */
16977 n = n * BITS_PER_UNIT;
16978
16979 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16980 larger than TImode, but we should not use them for loads/stores here. */
16981 const int copy_limit = GET_MODE_BITSIZE (TImode);
16982
16983 while (n > 0)
16984 {
16985 /* Find the largest mode in which to do the copy without over-reading
16986 or writing. */
16987 opt_scalar_int_mode mode_iter;
16988 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16989 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16990 cur_mode = mode_iter.require ();
16991
16992 gcc_assert (cur_mode != BLKmode);
16993
16994 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16995 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16996
16997 n -= mode_bits;
16998
16999 /* Do certain trailing copies as overlapping if it's going to be
17000 cheaper, i.e. fewer instructions to do so. For instance, for a 15
17001 byte copy it's more efficient to do two overlapping 8 byte copies than
17002 8 + 6 + 1. */
17003 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17004 {
17005 next_mode = smallest_mode_for_size (n, MODE_INT);
17006 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17007 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17008 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17009 n = n_bits;
17010 }
17011 }
17012
17013 return true;
17014 }
17015
17016 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17017 SImode stores. Handle the case when the constant has identical
17018 bottom and top halves. This is beneficial when the two stores can be
17019 merged into an STP and we avoid synthesising potentially expensive
17020 immediates twice. Return true if such a split is possible. */
17021
17022 bool
17023 aarch64_split_dimode_const_store (rtx dst, rtx src)
17024 {
17025 rtx lo = gen_lowpart (SImode, src);
17026 rtx hi = gen_highpart_mode (SImode, DImode, src);
17027
17028 bool size_p = optimize_function_for_size_p (cfun);
17029
17030 if (!rtx_equal_p (lo, hi))
17031 return false;
17032
17033 unsigned int orig_cost
17034 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17035 unsigned int lo_cost
17036 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17037
17038 /* We want to transform:
17039 MOV x1, 49370
17040 MOVK x1, 0x140, lsl 16
17041 MOVK x1, 0xc0da, lsl 32
17042 MOVK x1, 0x140, lsl 48
17043 STR x1, [x0]
17044 into:
17045 MOV w1, 49370
17046 MOVK w1, 0x140, lsl 16
17047 STP w1, w1, [x0]
17048 So we want to perform this only when we save two instructions
17049 or more. When optimizing for size, however, accept any code size
17050 savings we can. */
17051 if (size_p && orig_cost <= lo_cost)
17052 return false;
17053
17054 if (!size_p
17055 && (orig_cost <= lo_cost + 1))
17056 return false;
17057
17058 rtx mem_lo = adjust_address (dst, SImode, 0);
17059 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17060 return false;
17061
17062 rtx tmp_reg = gen_reg_rtx (SImode);
17063 aarch64_expand_mov_immediate (tmp_reg, lo);
17064 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17065 /* Don't emit an explicit store pair as this may not always be profitable.
17066 Let the sched-fusion logic decide whether to merge them. */
17067 emit_move_insn (mem_lo, tmp_reg);
17068 emit_move_insn (mem_hi, tmp_reg);
17069
17070 return true;
17071 }
17072
17073 /* Generate RTL for a conditional branch with rtx comparison CODE in
17074 mode CC_MODE. The destination of the unlikely conditional branch
17075 is LABEL_REF. */
17076
17077 void
17078 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17079 rtx label_ref)
17080 {
17081 rtx x;
17082 x = gen_rtx_fmt_ee (code, VOIDmode,
17083 gen_rtx_REG (cc_mode, CC_REGNUM),
17084 const0_rtx);
17085
17086 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17087 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17088 pc_rtx);
17089 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17090 }
17091
17092 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17093
17094 OP1 represents the TImode destination operand 1
17095 OP2 represents the TImode destination operand 2
17096 LOW_DEST represents the low half (DImode) of TImode operand 0
17097 LOW_IN1 represents the low half (DImode) of TImode operand 1
17098 LOW_IN2 represents the low half (DImode) of TImode operand 2
17099 HIGH_DEST represents the high half (DImode) of TImode operand 0
17100 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17101 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17102
17103 void
17104 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17105 rtx *low_in1, rtx *low_in2,
17106 rtx *high_dest, rtx *high_in1,
17107 rtx *high_in2)
17108 {
17109 *low_dest = gen_reg_rtx (DImode);
17110 *low_in1 = gen_lowpart (DImode, op1);
17111 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17112 subreg_lowpart_offset (DImode, TImode));
17113 *high_dest = gen_reg_rtx (DImode);
17114 *high_in1 = gen_highpart (DImode, op1);
17115 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17116 subreg_highpart_offset (DImode, TImode));
17117 }
17118
17119 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17120
17121 This function differs from 'aarch64_addti_scratch_regs' in that
17122 OP1 can be an immediate constant (zero). We must call
17123 subreg_highpart_offset with DImode and TImode arguments, otherwise
17124 VOIDmode will be used for the const_int which generates an internal
17125 error from subreg_size_highpart_offset which does not expect a size of zero.
17126
17127 OP1 represents the TImode destination operand 1
17128 OP2 represents the TImode destination operand 2
17129 LOW_DEST represents the low half (DImode) of TImode operand 0
17130 LOW_IN1 represents the low half (DImode) of TImode operand 1
17131 LOW_IN2 represents the low half (DImode) of TImode operand 2
17132 HIGH_DEST represents the high half (DImode) of TImode operand 0
17133 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17134 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17135
17136
17137 void
17138 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17139 rtx *low_in1, rtx *low_in2,
17140 rtx *high_dest, rtx *high_in1,
17141 rtx *high_in2)
17142 {
17143 *low_dest = gen_reg_rtx (DImode);
17144 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17145 subreg_lowpart_offset (DImode, TImode));
17146
17147 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17148 subreg_lowpart_offset (DImode, TImode));
17149 *high_dest = gen_reg_rtx (DImode);
17150
17151 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17152 subreg_highpart_offset (DImode, TImode));
17153 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17154 subreg_highpart_offset (DImode, TImode));
17155 }
17156
17157 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17158
17159 OP0 represents the TImode destination operand 0
17160 LOW_DEST represents the low half (DImode) of TImode operand 0
17161 LOW_IN1 represents the low half (DImode) of TImode operand 1
17162 LOW_IN2 represents the low half (DImode) of TImode operand 2
17163 HIGH_DEST represents the high half (DImode) of TImode operand 0
17164 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17165 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17166 UNSIGNED_P is true if the operation is being performed on unsigned
17167 values. */
17168 void
17169 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17170 rtx low_in2, rtx high_dest, rtx high_in1,
17171 rtx high_in2, bool unsigned_p)
17172 {
17173 if (low_in2 == const0_rtx)
17174 {
17175 low_dest = low_in1;
17176 high_in2 = force_reg (DImode, high_in2);
17177 if (unsigned_p)
17178 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17179 else
17180 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17181 }
17182 else
17183 {
17184 if (CONST_INT_P (low_in2))
17185 {
17186 high_in2 = force_reg (DImode, high_in2);
17187 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17188 GEN_INT (-INTVAL (low_in2))));
17189 }
17190 else
17191 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17192
17193 if (unsigned_p)
17194 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17195 else
17196 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17197 }
17198
17199 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17200 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17201
17202 }
17203
17204 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17205
17206 static unsigned HOST_WIDE_INT
17207 aarch64_asan_shadow_offset (void)
17208 {
17209 return (HOST_WIDE_INT_1 << 36);
17210 }
17211
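/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison, of
   TREEOP0 and TREEOP1 with code CODE, in a conditional-compare sequence.
   The operand set-up is emitted into *PREP_SEQ and the comparison itself
   into *GEN_SEQ. Return an rtx that compares the CC register against
   zero, or NULL_RTX if the comparison cannot be handled.  */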
17212 static rtx
17213 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17214 int code, tree treeop0, tree treeop1)
17215 {
17216 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17217 rtx op0, op1;
17218 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17219 insn_code icode;
17220 struct expand_operand ops[4];
17221
17222 start_sequence ();
17223 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17224
17225 op_mode = GET_MODE (op0);
17226 if (op_mode == VOIDmode)
17227 op_mode = GET_MODE (op1);
17228
17229 switch (op_mode)
17230 {
17231 case E_QImode:
17232 case E_HImode:
17233 case E_SImode:
17234 cmp_mode = SImode;
17235 icode = CODE_FOR_cmpsi;
17236 break;
17237
17238 case E_DImode:
17239 cmp_mode = DImode;
17240 icode = CODE_FOR_cmpdi;
17241 break;
17242
17243 case E_SFmode:
17244 cmp_mode = SFmode;
17245 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17246 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17247 break;
17248
17249 case E_DFmode:
17250 cmp_mode = DFmode;
17251 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17252 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17253 break;
17254
17255 default:
17256 end_sequence ();
17257 return NULL_RTX;
17258 }
17259
17260 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17261 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17262 if (!op0 || !op1)
17263 {
17264 end_sequence ();
17265 return NULL_RTX;
17266 }
17267 *prep_seq = get_insns ();
17268 end_sequence ();
17269
17270 create_fixed_operand (&ops[0], op0);
17271 create_fixed_operand (&ops[1], op1);
17272
17273 start_sequence ();
17274 if (!maybe_expand_insn (icode, 2, ops))
17275 {
17276 end_sequence ();
17277 return NULL_RTX;
17278 }
17279 *gen_seq = get_insns ();
17280 end_sequence ();
17281
17282 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17283 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17284 }
17285
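/* Implement TARGET_GEN_CCMP_NEXT. Expand a subsequent comparison, of
   TREEOP0 and TREEOP1 with code CMP_CODE, in a conditional-compare
   sequence whose previous comparison is PREV; BIT_CODE says how the two
   results are combined. Instructions are appended to *PREP_SEQ and
   *GEN_SEQ as for aarch64_gen_ccmp_first. Return an rtx that compares
   the resulting CC value against zero, or NULL_RTX if the comparison
   cannot be handled.  */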
17286 static rtx
17287 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17288 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17289 {
17290 rtx op0, op1, target;
17291 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17292 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17293 insn_code icode;
17294 struct expand_operand ops[6];
17295 int aarch64_cond;
17296
17297 push_to_sequence (*prep_seq);
17298 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17299
17300 op_mode = GET_MODE (op0);
17301 if (op_mode == VOIDmode)
17302 op_mode = GET_MODE (op1);
17303
17304 switch (op_mode)
17305 {
17306 case E_QImode:
17307 case E_HImode:
17308 case E_SImode:
17309 cmp_mode = SImode;
17310 icode = CODE_FOR_ccmpsi;
17311 break;
17312
17313 case E_DImode:
17314 cmp_mode = DImode;
17315 icode = CODE_FOR_ccmpdi;
17316 break;
17317
17318 case E_SFmode:
17319 cmp_mode = SFmode;
17320 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17321 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17322 break;
17323
17324 case E_DFmode:
17325 cmp_mode = DFmode;
17326 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17327 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17328 break;
17329
17330 default:
17331 end_sequence ();
17332 return NULL_RTX;
17333 }
17334
17335 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17336 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17337 if (!op0 || !op1)
17338 {
17339 end_sequence ();
17340 return NULL_RTX;
17341 }
17342 *prep_seq = get_insns ();
17343 end_sequence ();
17344
17345 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17346 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17347
17348 if (bit_code != AND)
17349 {
17350 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17351 GET_MODE (XEXP (prev, 0))),
17352 VOIDmode, XEXP (prev, 0), const0_rtx);
17353 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17354 }
17355
17356 create_fixed_operand (&ops[0], XEXP (prev, 0));
17357 create_fixed_operand (&ops[1], target);
17358 create_fixed_operand (&ops[2], op0);
17359 create_fixed_operand (&ops[3], op1);
17360 create_fixed_operand (&ops[4], prev);
17361 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17362
17363 push_to_sequence (*gen_seq);
17364 if (!maybe_expand_insn (icode, 6, ops))
17365 {
17366 end_sequence ();
17367 return NULL_RTX;
17368 }
17369
17370 *gen_seq = get_insns ();
17371 end_sequence ();
17372
17373 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17374 }
17375
17376 #undef TARGET_GEN_CCMP_FIRST
17377 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17378
17379 #undef TARGET_GEN_CCMP_NEXT
17380 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17381
17382 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17383 instruction fusion of some sort. */
17384
17385 static bool
17386 aarch64_macro_fusion_p (void)
17387 {
17388 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17389 }
17390
17391
17392 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17393 should be kept together during scheduling. */
17394
17395 static bool
17396 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17397 {
17398 rtx set_dest;
17399 rtx prev_set = single_set (prev);
17400 rtx curr_set = single_set (curr);
17401 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17402 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17403
17404 if (!aarch64_macro_fusion_p ())
17405 return false;
17406
17407 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17408 {
17409 /* We are trying to match:
17410 prev (mov) == (set (reg r0) (const_int imm16))
17411 curr (movk) == (set (zero_extract (reg r0)
17412 (const_int 16)
17413 (const_int 16))
17414 (const_int imm16_1)) */
17415
17416 set_dest = SET_DEST (curr_set);
17417
17418 if (GET_CODE (set_dest) == ZERO_EXTRACT
17419 && CONST_INT_P (SET_SRC (curr_set))
17420 && CONST_INT_P (SET_SRC (prev_set))
17421 && CONST_INT_P (XEXP (set_dest, 2))
17422 && INTVAL (XEXP (set_dest, 2)) == 16
17423 && REG_P (XEXP (set_dest, 0))
17424 && REG_P (SET_DEST (prev_set))
17425 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17426 {
17427 return true;
17428 }
17429 }
17430
17431 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17432 {
17433
17434 /* We're trying to match:
17435 prev (adrp) == (set (reg r1)
17436 (high (symbol_ref ("SYM"))))
17437 curr (add) == (set (reg r0)
17438 (lo_sum (reg r1)
17439 (symbol_ref ("SYM"))))
17440 Note that r0 need not necessarily be the same as r1, especially
17441 during pre-regalloc scheduling. */
17442
17443 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17444 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17445 {
17446 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17447 && REG_P (XEXP (SET_SRC (curr_set), 0))
17448 && REGNO (XEXP (SET_SRC (curr_set), 0))
17449 == REGNO (SET_DEST (prev_set))
17450 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17451 XEXP (SET_SRC (curr_set), 1)))
17452 return true;
17453 }
17454 }
17455
17456 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17457 {
17458
17459 /* We're trying to match:
17460 prev (movk) == (set (zero_extract (reg r0)
17461 (const_int 16)
17462 (const_int 32))
17463 (const_int imm16_1))
17464 curr (movk) == (set (zero_extract (reg r0)
17465 (const_int 16)
17466 (const_int 48))
17467 (const_int imm16_2)) */
17468
17469 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17470 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17471 && REG_P (XEXP (SET_DEST (prev_set), 0))
17472 && REG_P (XEXP (SET_DEST (curr_set), 0))
17473 && REGNO (XEXP (SET_DEST (prev_set), 0))
17474 == REGNO (XEXP (SET_DEST (curr_set), 0))
17475 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17476 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17477 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17478 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17479 && CONST_INT_P (SET_SRC (prev_set))
17480 && CONST_INT_P (SET_SRC (curr_set)))
17481 return true;
17482
17483 }
17484 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17485 {
17486 /* We're trying to match:
17487 prev (adrp) == (set (reg r0)
17488 (high (symbol_ref ("SYM"))))
17489 curr (ldr) == (set (reg r1)
17490 (mem (lo_sum (reg r0)
17491 (symbol_ref ("SYM")))))
17492 or
17493 curr (ldr) == (set (reg r1)
17494 (zero_extend (mem
17495 (lo_sum (reg r0)
17496 (symbol_ref ("SYM")))))) */
17497 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17498 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17499 {
17500 rtx curr_src = SET_SRC (curr_set);
17501
17502 if (GET_CODE (curr_src) == ZERO_EXTEND)
17503 curr_src = XEXP (curr_src, 0);
17504
17505 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17506 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17507 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17508 == REGNO (SET_DEST (prev_set))
17509 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17510 XEXP (SET_SRC (prev_set), 0)))
17511 return true;
17512 }
17513 }
17514
17515 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17516 && aarch_crypto_can_dual_issue (prev, curr))
17517 return true;
17518
17519 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17520 && any_condjump_p (curr))
17521 {
17522 unsigned int condreg1, condreg2;
17523 rtx cc_reg_1;
17524 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17525 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17526
17527 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17528 && prev
17529 && modified_in_p (cc_reg_1, prev))
17530 {
17531 enum attr_type prev_type = get_attr_type (prev);
17532
17533 /* FIXME: this misses some instructions which are considered simple
17534 arithmetic for ThunderX. Simple shifts are missed here. */
17535 if (prev_type == TYPE_ALUS_SREG
17536 || prev_type == TYPE_ALUS_IMM
17537 || prev_type == TYPE_LOGICS_REG
17538 || prev_type == TYPE_LOGICS_IMM)
17539 return true;
17540 }
17541 }
17542
17543 if (prev_set
17544 && curr_set
17545 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17546 && any_condjump_p (curr))
17547 {
17548 /* We're trying to match:
17549 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17550 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17551 (const_int 0))
17552 (label_ref ("SYM"))
17553 (pc)) */
17554 if (SET_DEST (curr_set) == (pc_rtx)
17555 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17556 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17557 && REG_P (SET_DEST (prev_set))
17558 && REGNO (SET_DEST (prev_set))
17559 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17560 {
17561 /* Fuse ALU operations followed by conditional branch instruction. */
17562 switch (get_attr_type (prev))
17563 {
17564 case TYPE_ALU_IMM:
17565 case TYPE_ALU_SREG:
17566 case TYPE_ADC_REG:
17567 case TYPE_ADC_IMM:
17568 case TYPE_ADCS_REG:
17569 case TYPE_ADCS_IMM:
17570 case TYPE_LOGIC_REG:
17571 case TYPE_LOGIC_IMM:
17572 case TYPE_CSEL:
17573 case TYPE_ADR:
17574 case TYPE_MOV_IMM:
17575 case TYPE_SHIFT_REG:
17576 case TYPE_SHIFT_IMM:
17577 case TYPE_BFM:
17578 case TYPE_RBIT:
17579 case TYPE_REV:
17580 case TYPE_EXTEND:
17581 return true;
17582
17583 default:;
17584 }
17585 }
17586 }
17587
17588 return false;
17589 }
17590
17591 /* Return true iff the instruction fusion described by OP is enabled. */
17592
17593 bool
17594 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17595 {
17596 return (aarch64_tune_params.fusible_ops & op) != 0;
17597 }
17598
17599 /* If MEM is in the form of [base+offset], extract the two parts
17600 of the address into BASE and OFFSET, otherwise return false
17601 after clearing BASE and OFFSET. */
17602
17603 bool
17604 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17605 {
17606 rtx addr;
17607
17608 gcc_assert (MEM_P (mem));
17609
17610 addr = XEXP (mem, 0);
17611
17612 if (REG_P (addr))
17613 {
17614 *base = addr;
17615 *offset = const0_rtx;
17616 return true;
17617 }
17618
17619 if (GET_CODE (addr) == PLUS
17620 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17621 {
17622 *base = XEXP (addr, 0);
17623 *offset = XEXP (addr, 1);
17624 return true;
17625 }
17626
17627 *base = NULL_RTX;
17628 *offset = NULL_RTX;
17629
17630 return false;
17631 }
17632
17633 /* Types for scheduling fusion. */
17634 enum sched_fusion_type
17635 {
17636 SCHED_FUSION_NONE = 0,
17637 SCHED_FUSION_LD_SIGN_EXTEND,
17638 SCHED_FUSION_LD_ZERO_EXTEND,
17639 SCHED_FUSION_LD,
17640 SCHED_FUSION_ST,
17641 SCHED_FUSION_NUM
17642 };
17643
17644 /* If INSN is a load or store whose address is in the form [base+offset],
17645 extract the two parts into BASE and OFFSET. Return the scheduling
17646 fusion type of INSN. */
17647
17648 static enum sched_fusion_type
17649 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17650 {
17651 rtx x, dest, src;
17652 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17653
17654 gcc_assert (INSN_P (insn));
17655 x = PATTERN (insn);
17656 if (GET_CODE (x) != SET)
17657 return SCHED_FUSION_NONE;
17658
17659 src = SET_SRC (x);
17660 dest = SET_DEST (x);
17661
17662 machine_mode dest_mode = GET_MODE (dest);
17663
17664 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17665 return SCHED_FUSION_NONE;
17666
17667 if (GET_CODE (src) == SIGN_EXTEND)
17668 {
17669 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17670 src = XEXP (src, 0);
17671 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17672 return SCHED_FUSION_NONE;
17673 }
17674 else if (GET_CODE (src) == ZERO_EXTEND)
17675 {
17676 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17677 src = XEXP (src, 0);
17678 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17679 return SCHED_FUSION_NONE;
17680 }
17681
17682 if (GET_CODE (src) == MEM && REG_P (dest))
17683 extract_base_offset_in_addr (src, base, offset);
17684 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17685 {
17686 fusion = SCHED_FUSION_ST;
17687 extract_base_offset_in_addr (dest, base, offset);
17688 }
17689 else
17690 return SCHED_FUSION_NONE;
17691
17692 if (*base == NULL_RTX || *offset == NULL_RTX)
17693 fusion = SCHED_FUSION_NONE;
17694
17695 return fusion;
17696 }
17697
17698 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17699
17700 Currently we only support fusing ldr or str instructions, so FUSION_PRI
17701 and PRI are only calculated for these instructions. For other instructions,
17702 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17703 types of instruction fusion can be added by returning different priorities.
17704
17705 It's important that irrelevant instructions get the largest FUSION_PRI. */
17706
17707 static void
17708 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17709 int *fusion_pri, int *pri)
17710 {
17711 int tmp, off_val;
17712 rtx base, offset;
17713 enum sched_fusion_type fusion;
17714
17715 gcc_assert (INSN_P (insn));
17716
17717 tmp = max_pri - 1;
17718 fusion = fusion_load_store (insn, &base, &offset);
17719 if (fusion == SCHED_FUSION_NONE)
17720 {
17721 *pri = tmp;
17722 *fusion_pri = tmp;
17723 return;
17724 }
17725
17726 /* Set FUSION_PRI according to fusion type and base register. */
17727 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17728
17729 /* Calculate PRI. */
17730 tmp /= 2;
17731
17732 /* INSN with smaller offset goes first. */
17733 off_val = (int)(INTVAL (offset));
17734 if (off_val >= 0)
17735 tmp -= (off_val & 0xfffff);
17736 else
17737 tmp += ((- off_val) & 0xfffff);
17738
17739 *pri = tmp;
17740 return;
17741 }
17742
17743 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17744 Adjust priority of sha1h instructions so they are scheduled before
17745 other SHA1 instructions. */
17746
17747 static int
17748 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17749 {
17750 rtx x = PATTERN (insn);
17751
17752 if (GET_CODE (x) == SET)
17753 {
17754 x = SET_SRC (x);
17755
17756 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17757 return priority + 10;
17758 }
17759
17760 return priority;
17761 }
17762
17763 /* Given OPERANDS of consecutive load/store, check if we can merge
17764 them into ldp/stp. LOAD is true if they are load instructions.
17765 MODE is the mode of memory operands. */
17766
17767 bool
17768 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17769 machine_mode mode)
17770 {
17771 HOST_WIDE_INT offval_1, offval_2, msize;
17772 enum reg_class rclass_1, rclass_2;
17773 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17774
17775 if (load)
17776 {
17777 mem_1 = operands[1];
17778 mem_2 = operands[3];
17779 reg_1 = operands[0];
17780 reg_2 = operands[2];
17781 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17782 if (REGNO (reg_1) == REGNO (reg_2))
17783 return false;
17784 }
17785 else
17786 {
17787 mem_1 = operands[0];
17788 mem_2 = operands[2];
17789 reg_1 = operands[1];
17790 reg_2 = operands[3];
17791 }
17792
17793 /* The mems cannot be volatile. */
17794 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17795 return false;
17796
17797 /* If we have SImode and slow unaligned ldp,
17798      check that the alignment is at least 8 bytes. */
17799 if (mode == SImode
17800 && (aarch64_tune_params.extra_tuning_flags
17801 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17802 && !optimize_size
17803 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17804 return false;
17805
17806 /* Check if the addresses are in the form of [base+offset]. */
17807 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17808 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17809 return false;
17810 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17811 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17812 return false;
17813
17814   /* Check if the bases are the same. */
17815 if (!rtx_equal_p (base_1, base_2))
17816 return false;
17817
17818 /* The operands must be of the same size. */
17819 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17820 GET_MODE_SIZE (GET_MODE (mem_2))));
17821
17822 offval_1 = INTVAL (offset_1);
17823 offval_2 = INTVAL (offset_2);
17824 /* We should only be trying this for fixed-sized modes. There is no
17825 SVE LDP/STP instruction. */
17826 msize = GET_MODE_SIZE (mode).to_constant ();
17827 /* Check if the offsets are consecutive. */
17828 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17829 return false;
17830
17831 /* Check if the addresses are clobbered by load. */
17832 if (load)
17833 {
17834 if (reg_mentioned_p (reg_1, mem_1))
17835 return false;
17836
17837 /* In increasing order, the last load can clobber the address. */
17838 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17839 return false;
17840 }
17841
17842 /* One of the memory accesses must be a mempair operand.
17843 If it is not the first one, they need to be swapped by the
17844 peephole. */
17845 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17846 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17847 return false;
17848
17849 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17850 rclass_1 = FP_REGS;
17851 else
17852 rclass_1 = GENERAL_REGS;
17853
17854 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17855 rclass_2 = FP_REGS;
17856 else
17857 rclass_2 = GENERAL_REGS;
17858
17859   /* Check if the registers are of the same class. */
17860 if (rclass_1 != rclass_2)
17861 return false;
17862
17863 return true;
17864 }
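
/* For example (illustrative), the pair

     ldr x0, [x3, 8]
     ldr x1, [x3, 16]

   passes the checks above: distinct destination registers of the same
   class, the same non-volatile base X3, and consecutive offsets (8 and 16
   for an 8-byte mode), so the two loads can be merged into
   "ldp x0, x1, [x3, 8]".  */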
17865
17866 /* Given OPERANDS of consecutive load/store that can be merged,
17867 swap them if they are not in ascending order. */
17868 void
17869 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17870 {
17871 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17872 HOST_WIDE_INT offval_1, offval_2;
17873
17874 if (load)
17875 {
17876 mem_1 = operands[1];
17877 mem_2 = operands[3];
17878 }
17879 else
17880 {
17881 mem_1 = operands[0];
17882 mem_2 = operands[2];
17883 }
17884
17885 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17886 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17887
17888 offval_1 = INTVAL (offset_1);
17889 offval_2 = INTVAL (offset_2);
17890
17891 if (offval_1 > offval_2)
17892 {
17893 /* Irrespective of whether this is a load or a store,
17894 we do the same swap. */
17895 std::swap (operands[0], operands[2]);
17896 std::swap (operands[1], operands[3]);
17897 }
17898 }
17899
17900 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17901 comparison between the two. */
17902 int
17903 aarch64_host_wide_int_compare (const void *x, const void *y)
17904 {
17905 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17906 * ((const HOST_WIDE_INT *) y));
17907 }
17908
17909 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17910 other pointing to a REG rtx containing an offset, compare the offsets
17911 of the two pairs.
17912
17913 Return:
17914
17915 1 iff offset (X) > offset (Y)
17916 0 iff offset (X) == offset (Y)
17917 -1 iff offset (X) < offset (Y) */
17918 int
17919 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17920 {
17921 const rtx * operands_1 = (const rtx *) x;
17922 const rtx * operands_2 = (const rtx *) y;
17923 rtx mem_1, mem_2, base, offset_1, offset_2;
17924
17925 if (MEM_P (operands_1[0]))
17926 mem_1 = operands_1[0];
17927 else
17928 mem_1 = operands_1[1];
17929
17930 if (MEM_P (operands_2[0]))
17931 mem_2 = operands_2[0];
17932 else
17933 mem_2 = operands_2[1];
17934
17935 /* Extract the offsets. */
17936 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17937 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17938
17939 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17940
17941 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17942 }
17943
17944 /* Given OPERANDS of consecutive load/store, check if we can merge
17945 them into ldp/stp by adjusting the offset. LOAD is true if they
17946 are load instructions. MODE is the mode of memory operands.
17947
17948    Given the four consecutive stores below:
17949
17950 str w1, [xb, 0x100]
17951 str w1, [xb, 0x104]
17952 str w1, [xb, 0x108]
17953 str w1, [xb, 0x10c]
17954
17955 Though the offsets are out of the range supported by stp, we can
17956 still pair them after adjusting the offset, like:
17957
17958 add scratch, xb, 0x100
17959 stp w1, w1, [scratch]
17960 stp w1, w1, [scratch, 0x8]
17961
17962 The peephole patterns detecting this opportunity should guarantee
17963    the scratch register is available. */
17964
17965 bool
17966 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17967 scalar_mode mode)
17968 {
17969 const int num_insns = 4;
17970 enum reg_class rclass;
17971 HOST_WIDE_INT offvals[num_insns], msize;
17972 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17973
17974 if (load)
17975 {
17976 for (int i = 0; i < num_insns; i++)
17977 {
17978 reg[i] = operands[2 * i];
17979 mem[i] = operands[2 * i + 1];
17980
17981 gcc_assert (REG_P (reg[i]));
17982 }
17983
17984 /* Do not attempt to merge the loads if the loads clobber each other. */
17985 for (int i = 0; i < 8; i += 2)
17986 for (int j = i + 2; j < 8; j += 2)
17987 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17988 return false;
17989 }
17990 else
17991 for (int i = 0; i < num_insns; i++)
17992 {
17993 mem[i] = operands[2 * i];
17994 reg[i] = operands[2 * i + 1];
17995 }
17996
17997   /* Skip if the memory operand is already valid for ldp/stp on its own. */
17998 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17999 return false;
18000
18001 for (int i = 0; i < num_insns; i++)
18002 {
18003 /* The mems cannot be volatile. */
18004 if (MEM_VOLATILE_P (mem[i]))
18005 return false;
18006
18007 /* Check if the addresses are in the form of [base+offset]. */
18008 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18009 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18010 return false;
18011 }
18012
18013   /* Check if the registers are of the same class. */
18014 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18015 ? FP_REGS : GENERAL_REGS;
18016
18017 for (int i = 1; i < num_insns; i++)
18018 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18019 {
18020 if (rclass != FP_REGS)
18021 return false;
18022 }
18023 else
18024 {
18025 if (rclass != GENERAL_REGS)
18026 return false;
18027 }
18028
18029 /* Only the last register in the order in which they occur
18030 may be clobbered by the load. */
18031 if (rclass == GENERAL_REGS && load)
18032 for (int i = 0; i < num_insns - 1; i++)
18033 if (reg_mentioned_p (reg[i], mem[i]))
18034 return false;
18035
18036   /* Check if the bases are the same. */
18037 for (int i = 0; i < num_insns - 1; i++)
18038 if (!rtx_equal_p (base[i], base[i + 1]))
18039 return false;
18040
18041 for (int i = 0; i < num_insns; i++)
18042 offvals[i] = INTVAL (offset[i]);
18043
18044 msize = GET_MODE_SIZE (mode);
18045
18046 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18047 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18048 aarch64_host_wide_int_compare);
18049
18050 if (!(offvals[1] == offvals[0] + msize
18051 && offvals[3] == offvals[2] + msize))
18052 return false;
18053
18054 /* Check that offsets are within range of each other. The ldp/stp
18055 instructions have 7 bit immediate offsets, so use 0x80. */
18056 if (offvals[2] - offvals[0] >= msize * 0x80)
18057 return false;
18058
18059 /* The offsets must be aligned with respect to each other. */
18060 if (offvals[0] % msize != offvals[2] % msize)
18061 return false;
18062
18063 /* If we have SImode and slow unaligned ldp,
18064      check that the alignment is at least 8 bytes. */
18065 if (mode == SImode
18066 && (aarch64_tune_params.extra_tuning_flags
18067 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18068 && !optimize_size
18069 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18070 return false;
18071
18072 return true;
18073 }
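
/* Continuing the example in the comment above (illustrative): for the four
   SImode stores at offsets 0x100, 0x104, 0x108 and 0x10c, the sorted
   offsets pair up as (0x100, 0x104) and (0x108, 0x10c), each pair differing
   by msize == 4; the overall span 0x10c - 0x100 == 12 is well below
   msize * 0x80 == 512; and 0x100 % 4 == 0x108 % 4, so the function returns
   true and the peephole can go on to call aarch64_gen_adjusted_ldpstp.  */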
18074
18075 /* Given OPERANDS of consecutive load/store, this function pairs them
18076 into LDP/STP after adjusting the offset. It depends on the fact
18077 that the operands can be sorted so the offsets are correct for STP.
18078 MODE is the mode of memory operands. CODE is the rtl operator
18079    which should be applied to all memory operands; it is SIGN_EXTEND,
18080 ZERO_EXTEND or UNKNOWN. */
18081
18082 bool
18083 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18084 scalar_mode mode, RTX_CODE code)
18085 {
18086 rtx base, offset_1, offset_3, t1, t2;
18087 rtx mem_1, mem_2, mem_3, mem_4;
18088 rtx temp_operands[8];
18089 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18090 stp_off_upper_limit, stp_off_lower_limit, msize;
18091
18092 /* We make changes on a copy as we may still bail out. */
18093 for (int i = 0; i < 8; i ++)
18094 temp_operands[i] = operands[i];
18095
18096 /* Sort the operands. */
18097 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18098
18099 if (load)
18100 {
18101 mem_1 = temp_operands[1];
18102 mem_2 = temp_operands[3];
18103 mem_3 = temp_operands[5];
18104 mem_4 = temp_operands[7];
18105 }
18106 else
18107 {
18108 mem_1 = temp_operands[0];
18109 mem_2 = temp_operands[2];
18110 mem_3 = temp_operands[4];
18111 mem_4 = temp_operands[6];
18112 gcc_assert (code == UNKNOWN);
18113 }
18114
18115 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18116 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18117 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18118 && offset_3 != NULL_RTX);
18119
18120 /* Adjust offset so it can fit in LDP/STP instruction. */
18121 msize = GET_MODE_SIZE (mode);
18122 stp_off_upper_limit = msize * (0x40 - 1);
18123 stp_off_lower_limit = - msize * 0x40;
18124
18125 off_val_1 = INTVAL (offset_1);
18126 off_val_3 = INTVAL (offset_3);
18127
18128 /* The base offset is optimally half way between the two STP/LDP offsets. */
18129 if (msize <= 4)
18130 base_off = (off_val_1 + off_val_3) / 2;
18131 else
18132 /* However, due to issues with negative LDP/STP offset generation for
18133        larger modes (DF, DI and vector modes), we must not use negative
18134        addresses smaller than what 9 signed unadjusted bits can store.  This
18135 provides the most range in this case. */
18136 base_off = off_val_1;
18137
18138 /* Adjust the base so that it is aligned with the addresses but still
18139 optimal. */
18140 if (base_off % msize != off_val_1 % msize)
18141 /* Fix the offset, bearing in mind we want to make it bigger not
18142 smaller. */
18143 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18144 else if (msize <= 4)
18145 /* The negative range of LDP/STP is one larger than the positive range. */
18146 base_off += msize;
18147
18148 /* Check if base offset is too big or too small. We can attempt to resolve
18149 this issue by setting it to the maximum value and seeing if the offsets
18150 still fit. */
18151 if (base_off >= 0x1000)
18152 {
18153 base_off = 0x1000 - 1;
18154 /* We must still make sure that the base offset is aligned with respect
18155	 to the address.  But it may not be made any bigger. */
18156 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18157 }
18158
18159 /* Likewise for the case where the base is too small. */
18160 if (base_off <= -0x1000)
18161 {
18162 base_off = -0x1000 + 1;
18163 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18164 }
18165
18166 /* Offset of the first STP/LDP. */
18167 new_off_1 = off_val_1 - base_off;
18168
18169 /* Offset of the second STP/LDP. */
18170 new_off_3 = off_val_3 - base_off;
18171
18172 /* The offsets must be within the range of the LDP/STP instructions. */
18173 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18174 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18175 return false;
18176
18177 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18178 new_off_1), true);
18179 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18180 new_off_1 + msize), true);
18181 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18182 new_off_3), true);
18183 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18184 new_off_3 + msize), true);
18185
18186 if (!aarch64_mem_pair_operand (mem_1, mode)
18187 || !aarch64_mem_pair_operand (mem_3, mode))
18188 return false;
18189
18190 if (code == ZERO_EXTEND)
18191 {
18192 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18193 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18194 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18195 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18196 }
18197 else if (code == SIGN_EXTEND)
18198 {
18199 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18200 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18201 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18202 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18203 }
18204
18205 if (load)
18206 {
18207 operands[0] = temp_operands[0];
18208 operands[1] = mem_1;
18209 operands[2] = temp_operands[2];
18210 operands[3] = mem_2;
18211 operands[4] = temp_operands[4];
18212 operands[5] = mem_3;
18213 operands[6] = temp_operands[6];
18214 operands[7] = mem_4;
18215 }
18216 else
18217 {
18218 operands[0] = mem_1;
18219 operands[1] = temp_operands[1];
18220 operands[2] = mem_2;
18221 operands[3] = temp_operands[3];
18222 operands[4] = mem_3;
18223 operands[5] = temp_operands[5];
18224 operands[6] = mem_4;
18225 operands[7] = temp_operands[7];
18226 }
18227
18228 /* Emit adjusting instruction. */
18229 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18230 /* Emit ldp/stp instructions. */
18231 t1 = gen_rtx_SET (operands[0], operands[1]);
18232 t2 = gen_rtx_SET (operands[2], operands[3]);
18233 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18234 t1 = gen_rtx_SET (operands[4], operands[5]);
18235 t2 = gen_rtx_SET (operands[6], operands[7]);
18236 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18237 return true;
18238 }
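
/* A worked example of the adjustment above (illustrative): for the four
   SImode stores at offsets 0x100..0x10c, msize == 4, so the initial base
   offset is (0x100 + 0x108) / 2 == 0x104; it is already aligned with the
   addresses, so it is bumped by msize to 0x108 to exploit the larger
   negative range.  The new offsets become -8 and 0, both within
   [-0x100, 0xfc], and the emitted sequence is roughly:

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]  */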
18239
18240 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18241 it isn't worth branching around empty masked ops (including masked
18242 stores). */
18243
18244 static bool
18245 aarch64_empty_mask_is_expensive (unsigned)
18246 {
18247 return false;
18248 }
18249
18250 /* Return true if a pseudo register should be created and used to hold
18251    the GOT address for PIC code. */
18252
18253 bool
18254 aarch64_use_pseudo_pic_reg (void)
18255 {
18256 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18257 }
18258
18259 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18260
18261 static int
18262 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18263 {
18264 switch (XINT (x, 1))
18265 {
18266 case UNSPEC_GOTSMALLPIC:
18267 case UNSPEC_GOTSMALLPIC28K:
18268 case UNSPEC_GOTTINYPIC:
18269 return 0;
18270 default:
18271 break;
18272 }
18273
18274 return default_unspec_may_trap_p (x, flags);
18275 }
18276
18277
18278 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18279 return the log2 of that value. Otherwise return -1. */
18280
18281 int
18282 aarch64_fpconst_pow_of_2 (rtx x)
18283 {
18284 const REAL_VALUE_TYPE *r;
18285
18286 if (!CONST_DOUBLE_P (x))
18287 return -1;
18288
18289 r = CONST_DOUBLE_REAL_VALUE (x);
18290
18291 if (REAL_VALUE_NEGATIVE (*r)
18292 || REAL_VALUE_ISNAN (*r)
18293 || REAL_VALUE_ISINF (*r)
18294 || !real_isinteger (r, DFmode))
18295 return -1;
18296
18297 return exact_log2 (real_to_integer (r));
18298 }
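
/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.75
   (not an integer), -4.0 (negative) and 3.0 (an integer but not a power
   of 2) all yield -1.  */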
18299
18300 /* If X is a vector of equal CONST_DOUBLE values and that value is
18301 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18302
18303 int
18304 aarch64_vec_fpconst_pow_of_2 (rtx x)
18305 {
18306 int nelts;
18307 if (GET_CODE (x) != CONST_VECTOR
18308 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18309 return -1;
18310
18311 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18312 return -1;
18313
18314 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18315 if (firstval <= 0)
18316 return -1;
18317
18318 for (int i = 1; i < nelts; i++)
18319 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18320 return -1;
18321
18322 return firstval;
18323 }
18324
18325 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18326 to float.
18327
18328 __fp16 always promotes through this hook.
18329 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18330 through the generic excess precision logic rather than here. */
18331
18332 static tree
18333 aarch64_promoted_type (const_tree t)
18334 {
18335 if (SCALAR_FLOAT_TYPE_P (t)
18336 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18337 return float_type_node;
18338
18339 return NULL_TREE;
18340 }
18341
18342 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18343
18344 static bool
18345 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18346 optimization_type opt_type)
18347 {
18348 switch (op)
18349 {
18350 case rsqrt_optab:
18351 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18352
18353 default:
18354 return true;
18355 }
18356 }
18357
18358 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18359
18360 static unsigned int
18361 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18362 int *offset)
18363 {
18364 /* Polynomial invariant 1 == (VG / 2) - 1. */
18365 gcc_assert (i == 1);
18366 *factor = 2;
18367 *offset = 1;
18368 return AARCH64_DWARF_VG;
18369 }
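
/* For example (illustrative): with a 256-bit SVE vector length the DWARF
   pseudo register VG (the number of 64-bit granules) is 4, so indeterminate
   1 evaluates to 4 / 2 - 1 == 1 and a poly_int size such as 16 + 16x
   resolves to 32 bytes.  */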
18370
18371 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18372 if MODE is HFmode, and punt to the generic implementation otherwise. */
18373
18374 static bool
18375 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18376 {
18377 return (mode == HFmode
18378 ? true
18379 : default_libgcc_floating_mode_supported_p (mode));
18380 }
18381
18382 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18383 if MODE is HFmode, and punt to the generic implementation otherwise. */
18384
18385 static bool
18386 aarch64_scalar_mode_supported_p (scalar_mode mode)
18387 {
18388 return (mode == HFmode
18389 ? true
18390 : default_scalar_mode_supported_p (mode));
18391 }
18392
18393 /* Set the value of FLT_EVAL_METHOD.
18394 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18395
18396 0: evaluate all operations and constants, whose semantic type has at
18397 most the range and precision of type float, to the range and
18398 precision of float; evaluate all other operations and constants to
18399 the range and precision of the semantic type;
18400
18401    N, where _FloatN is a supported interchange floating type:
18402 evaluate all operations and constants, whose semantic type has at
18403 most the range and precision of _FloatN type, to the range and
18404 precision of the _FloatN type; evaluate all other operations and
18405 constants to the range and precision of the semantic type;
18406
18407 If we have the ARMv8.2-A extensions then we support _Float16 in native
18408 precision, so we should set this to 16. Otherwise, we support the type,
18409 but want to evaluate expressions in float precision, so set this to
18410 0. */
18411
18412 static enum flt_eval_method
18413 aarch64_excess_precision (enum excess_precision_type type)
18414 {
18415 switch (type)
18416 {
18417 case EXCESS_PRECISION_TYPE_FAST:
18418 case EXCESS_PRECISION_TYPE_STANDARD:
18419 /* We can calculate either in 16-bit range and precision or
18420 32-bit range and precision. Make that decision based on whether
18421 we have native support for the ARMv8.2-A 16-bit floating-point
18422 instructions or not. */
18423 return (TARGET_FP_F16INST
18424 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18425 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18426 case EXCESS_PRECISION_TYPE_IMPLICIT:
18427 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18428 default:
18429 gcc_unreachable ();
18430 }
18431 return FLT_EVAL_METHOD_UNPREDICTABLE;
18432 }
18433
18434 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18435 scheduled for speculative execution. Reject the long-running division
18436 and square-root instructions. */
18437
18438 static bool
18439 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18440 {
18441 switch (get_attr_type (insn))
18442 {
18443 case TYPE_SDIV:
18444 case TYPE_UDIV:
18445 case TYPE_FDIVS:
18446 case TYPE_FDIVD:
18447 case TYPE_FSQRTS:
18448 case TYPE_FSQRTD:
18449 case TYPE_NEON_FP_SQRT_S:
18450 case TYPE_NEON_FP_SQRT_D:
18451 case TYPE_NEON_FP_SQRT_S_Q:
18452 case TYPE_NEON_FP_SQRT_D_Q:
18453 case TYPE_NEON_FP_DIV_S:
18454 case TYPE_NEON_FP_DIV_D:
18455 case TYPE_NEON_FP_DIV_S_Q:
18456 case TYPE_NEON_FP_DIV_D_Q:
18457 return false;
18458 default:
18459 return true;
18460 }
18461 }
18462
18463 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18464
18465 static int
18466 aarch64_compute_pressure_classes (reg_class *classes)
18467 {
18468 int i = 0;
18469 classes[i++] = GENERAL_REGS;
18470 classes[i++] = FP_REGS;
18471 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18472 registers need to go in PR_LO_REGS at some point during their
18473 lifetime. Splitting it into two halves has the effect of making
18474 all predicates count against PR_LO_REGS, so that we try whenever
18475 possible to restrict the number of live predicates to 8. This
18476 greatly reduces the amount of spilling in certain loops. */
18477 classes[i++] = PR_LO_REGS;
18478 classes[i++] = PR_HI_REGS;
18479 return i;
18480 }
18481
18482 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18483
18484 static bool
18485 aarch64_can_change_mode_class (machine_mode from,
18486 machine_mode to, reg_class_t)
18487 {
18488 if (BYTES_BIG_ENDIAN)
18489 {
18490 bool from_sve_p = aarch64_sve_data_mode_p (from);
18491 bool to_sve_p = aarch64_sve_data_mode_p (to);
18492
18493 /* Don't allow changes between SVE data modes and non-SVE modes.
18494 See the comment at the head of aarch64-sve.md for details. */
18495 if (from_sve_p != to_sve_p)
18496 return false;
18497
18498 /* Don't allow changes in element size: lane 0 of the new vector
18499 would not then be lane 0 of the old vector. See the comment
18500 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18501 description.
18502
18503 In the worst case, this forces a register to be spilled in
18504 one mode and reloaded in the other, which handles the
18505 endianness correctly. */
18506 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18507 return false;
18508 }
18509 return true;
18510 }
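
/* For instance (illustrative), on big-endian targets a mode change from
   VNx4SI to VNx8HI is rejected by the check above because the element size
   changes from 4 bytes to 2, whereas VNx4SI to VNx4SF is allowed since both
   have 4-byte elements.  */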
18511
18512 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
18513
18514 static void
18515 aarch64_select_early_remat_modes (sbitmap modes)
18516 {
18517 /* SVE values are not normally live across a call, so it should be
18518 worth doing early rematerialization even in VL-specific mode. */
18519 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18520 {
18521 machine_mode mode = (machine_mode) i;
18522 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18523 if (vec_flags & VEC_ANY_SVE)
18524 bitmap_set_bit (modes, i);
18525 }
18526 }
18527
18528 /* Override the default target speculation_safe_value. */
18529 static rtx
18530 aarch64_speculation_safe_value (machine_mode mode,
18531 rtx result, rtx val, rtx failval)
18532 {
18533 /* Maybe we should warn if falling back to hard barriers. They are
18534      likely to be noticeably more expensive than the alternative below. */
18535 if (!aarch64_track_speculation)
18536 return default_speculation_safe_value (mode, result, val, failval);
18537
18538 if (!REG_P (val))
18539 val = copy_to_mode_reg (mode, val);
18540
18541 if (!aarch64_reg_or_zero (failval, mode))
18542 failval = copy_to_mode_reg (mode, failval);
18543
18544 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18545 return result;
18546 }
18547
18548 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18549 Look into the tuning structure for an estimate.
18550 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18551 Advanced SIMD 128 bits. */
18552
18553 static HOST_WIDE_INT
18554 aarch64_estimated_poly_value (poly_int64 val)
18555 {
18556 enum aarch64_sve_vector_bits_enum width_source
18557 = aarch64_tune_params.sve_width;
18558
18559 /* If we still don't have an estimate, use the default. */
18560 if (width_source == SVE_SCALABLE)
18561 return default_estimated_poly_value (val);
18562
18563 HOST_WIDE_INT over_128 = width_source - 128;
18564 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18565 }
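
/* For example (illustrative): if the tuning structure gives sve_width as
   256, then over_128 is 128 and a poly_int64 value of 16 + 16x is estimated
   as 16 + 16 * 128 / 128 == 32.  With SVE_SCALABLE the generic default
   estimate is used instead.  */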
18566
18567
18568 /* Return true for types that could be supported as SIMD return or
18569 argument types. */
18570
18571 static bool
18572 supported_simd_type (tree t)
18573 {
18574 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18575 {
18576 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18577 return s == 1 || s == 2 || s == 4 || s == 8;
18578 }
18579 return false;
18580 }
18581
18582 /* Return true for types that currently are supported as SIMD return
18583 or argument types. */
18584
18585 static bool
18586 currently_supported_simd_type (tree t, tree b)
18587 {
18588 if (COMPLEX_FLOAT_TYPE_P (t))
18589 return false;
18590
18591 if (TYPE_SIZE (t) != TYPE_SIZE (b))
18592 return false;
18593
18594 return supported_simd_type (t);
18595 }
18596
18597 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
18598
18599 static int
18600 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
18601 struct cgraph_simd_clone *clonei,
18602 tree base_type, int num)
18603 {
18604 tree t, ret_type, arg_type;
18605 unsigned int elt_bits, vec_bits, count;
18606
18607 if (!TARGET_SIMD)
18608 return 0;
18609
18610 if (clonei->simdlen
18611 && (clonei->simdlen < 2
18612 || clonei->simdlen > 1024
18613 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
18614 {
18615 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18616 "unsupported simdlen %d", clonei->simdlen);
18617 return 0;
18618 }
18619
18620 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
18621 if (TREE_CODE (ret_type) != VOID_TYPE
18622 && !currently_supported_simd_type (ret_type, base_type))
18623 {
18624 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
18625 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18626 "GCC does not currently support mixed size types "
18627 "for %<simd%> functions");
18628 else if (supported_simd_type (ret_type))
18629 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18630 "GCC does not currently support return type %qT "
18631 "for %<simd%> functions", ret_type);
18632 else
18633 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18634 "unsupported return type %qT for %<simd%> functions",
18635 ret_type);
18636 return 0;
18637 }
18638
18639 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
18640 {
18641 arg_type = TREE_TYPE (t);
18642
18643 if (!currently_supported_simd_type (arg_type, base_type))
18644 {
18645 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
18646 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18647 "GCC does not currently support mixed size types "
18648 "for %<simd%> functions");
18649 else
18650 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18651 "GCC does not currently support argument type %qT "
18652 "for %<simd%> functions", arg_type);
18653 return 0;
18654 }
18655 }
18656
18657 clonei->vecsize_mangle = 'n';
18658 clonei->mask_mode = VOIDmode;
18659 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
18660 if (clonei->simdlen == 0)
18661 {
18662 count = 2;
18663 vec_bits = (num == 0 ? 64 : 128);
18664 clonei->simdlen = vec_bits / elt_bits;
18665 }
18666 else
18667 {
18668 count = 1;
18669 vec_bits = clonei->simdlen * elt_bits;
18670 if (vec_bits != 64 && vec_bits != 128)
18671 {
18672 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18673 "GCC does not currently support simdlen %d for type %qT",
18674 clonei->simdlen, base_type);
18675 return 0;
18676 }
18677 }
18678 clonei->vecsize_int = vec_bits;
18679 clonei->vecsize_float = vec_bits;
18680 return count;
18681 }
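
/* An illustrative example of the computation above: for a SIMD clone whose
   base type is a 32-bit float and which has no explicit simdlen, elt_bits
   is 32 and two 'n'-mangled clones are produced, one using 64-bit vectors
   (simdlen 2) and one using 128-bit vectors (simdlen 4).  An explicit
   simdlen that does not give a 64-bit or 128-bit vector size is warned
   about and rejected.  */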
18682
18683 /* Implement TARGET_SIMD_CLONE_ADJUST. */
18684
18685 static void
18686 aarch64_simd_clone_adjust (struct cgraph_node *node)
18687 {
18688 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
18689 use the correct ABI. */
18690
18691 tree t = TREE_TYPE (node->decl);
18692 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
18693 TYPE_ATTRIBUTES (t));
18694 }
18695
18696 /* Implement TARGET_SIMD_CLONE_USABLE. */
18697
18698 static int
18699 aarch64_simd_clone_usable (struct cgraph_node *node)
18700 {
18701 switch (node->simdclone->vecsize_mangle)
18702 {
18703 case 'n':
18704 if (!TARGET_SIMD)
18705 return -1;
18706 return 0;
18707 default:
18708 gcc_unreachable ();
18709 }
18710 }
18711
18712 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
18713    global variable based guard, use the default; otherwise
18714    return a null tree. */
18715 static tree
18716 aarch64_stack_protect_guard (void)
18717 {
18718 if (aarch64_stack_protector_guard == SSP_GLOBAL)
18719 return default_stack_protect_guard ();
18720
18721 return NULL_TREE;
18722 }
18723
18724
18725 /* Target-specific selftests. */
18726
18727 #if CHECKING_P
18728
18729 namespace selftest {
18730
18731 /* Selftest for the RTL loader.
18732 Verify that the RTL loader copes with a dump from
18733 print_rtx_function. This is essentially just a test that class
18734 function_reader can handle a real dump, but it also verifies
18735 that lookup_reg_by_dump_name correctly handles hard regs.
18736 The presence of hard reg names in the dump means that the test is
18737 target-specific, hence it is in this file. */
18738
18739 static void
18740 aarch64_test_loading_full_dump ()
18741 {
18742 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18743
18744 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18745
18746 rtx_insn *insn_1 = get_insn_by_uid (1);
18747 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18748
18749 rtx_insn *insn_15 = get_insn_by_uid (15);
18750 ASSERT_EQ (INSN, GET_CODE (insn_15));
18751 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18752
18753 /* Verify crtl->return_rtx. */
18754 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18755 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18756 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18757 }
18758
18759 /* Run all target-specific selftests. */
18760
18761 static void
18762 aarch64_run_selftests (void)
18763 {
18764 aarch64_test_loading_full_dump ();
18765 }
18766
18767 } // namespace selftest
18768
18769 #endif /* #if CHECKING_P */
18770
18771 #undef TARGET_STACK_PROTECT_GUARD
18772 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18773
18774 #undef TARGET_ADDRESS_COST
18775 #define TARGET_ADDRESS_COST aarch64_address_cost
18776
18777 /* This hook determines whether unnamed bitfields affect the alignment
18778 of the containing structure. The hook returns true if the structure
18779 should inherit the alignment requirements of an unnamed bitfield's
18780 type. */
18781 #undef TARGET_ALIGN_ANON_BITFIELD
18782 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18783
18784 #undef TARGET_ASM_ALIGNED_DI_OP
18785 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18786
18787 #undef TARGET_ASM_ALIGNED_HI_OP
18788 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18789
18790 #undef TARGET_ASM_ALIGNED_SI_OP
18791 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18792
18793 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18794 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18795 hook_bool_const_tree_hwi_hwi_const_tree_true
18796
18797 #undef TARGET_ASM_FILE_START
18798 #define TARGET_ASM_FILE_START aarch64_start_file
18799
18800 #undef TARGET_ASM_OUTPUT_MI_THUNK
18801 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18802
18803 #undef TARGET_ASM_SELECT_RTX_SECTION
18804 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18805
18806 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18807 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18808
18809 #undef TARGET_BUILD_BUILTIN_VA_LIST
18810 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18811
18812 #undef TARGET_CALLEE_COPIES
18813 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18814
18815 #undef TARGET_CAN_ELIMINATE
18816 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18817
18818 #undef TARGET_CAN_INLINE_P
18819 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18820
18821 #undef TARGET_CANNOT_FORCE_CONST_MEM
18822 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18823
18824 #undef TARGET_CASE_VALUES_THRESHOLD
18825 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18826
18827 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18828 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18829
18830 /* Only the least significant bit is used for initialization guard
18831 variables. */
18832 #undef TARGET_CXX_GUARD_MASK_BIT
18833 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18834
18835 #undef TARGET_C_MODE_FOR_SUFFIX
18836 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18837
18838 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18839 #undef TARGET_DEFAULT_TARGET_FLAGS
18840 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18841 #endif
18842
18843 #undef TARGET_CLASS_MAX_NREGS
18844 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18845
18846 #undef TARGET_BUILTIN_DECL
18847 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18848
18849 #undef TARGET_BUILTIN_RECIPROCAL
18850 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18851
18852 #undef TARGET_C_EXCESS_PRECISION
18853 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18854
18855 #undef TARGET_EXPAND_BUILTIN
18856 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18857
18858 #undef TARGET_EXPAND_BUILTIN_VA_START
18859 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18860
18861 #undef TARGET_FOLD_BUILTIN
18862 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18863
18864 #undef TARGET_FUNCTION_ARG
18865 #define TARGET_FUNCTION_ARG aarch64_function_arg
18866
18867 #undef TARGET_FUNCTION_ARG_ADVANCE
18868 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18869
18870 #undef TARGET_FUNCTION_ARG_BOUNDARY
18871 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18872
18873 #undef TARGET_FUNCTION_ARG_PADDING
18874 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18875
18876 #undef TARGET_GET_RAW_RESULT_MODE
18877 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18878 #undef TARGET_GET_RAW_ARG_MODE
18879 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18880
18881 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18882 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18883
18884 #undef TARGET_FUNCTION_VALUE
18885 #define TARGET_FUNCTION_VALUE aarch64_function_value
18886
18887 #undef TARGET_FUNCTION_VALUE_REGNO_P
18888 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18889
18890 #undef TARGET_GIMPLE_FOLD_BUILTIN
18891 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18892
18893 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18894 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18895
18896 #undef TARGET_INIT_BUILTINS
18897 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18898
18899 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18900 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18901 aarch64_ira_change_pseudo_allocno_class
18902
18903 #undef TARGET_LEGITIMATE_ADDRESS_P
18904 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18905
18906 #undef TARGET_LEGITIMATE_CONSTANT_P
18907 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18908
18909 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18910 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18911 aarch64_legitimize_address_displacement
18912
18913 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18914 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18915
18916 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18917 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18918 aarch64_libgcc_floating_mode_supported_p
18919
18920 #undef TARGET_MANGLE_TYPE
18921 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18922
18923 #undef TARGET_MEMORY_MOVE_COST
18924 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18925
18926 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18927 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18928
18929 #undef TARGET_MUST_PASS_IN_STACK
18930 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18931
18932 /* This target hook should return true if accesses to volatile bitfields
18933 should use the narrowest mode possible. It should return false if these
18934 accesses should use the bitfield container type. */
18935 #undef TARGET_NARROW_VOLATILE_BITFIELD
18936 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18937
18938 #undef TARGET_OPTION_OVERRIDE
18939 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18940
18941 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18942 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18943 aarch64_override_options_after_change
18944
18945 #undef TARGET_OPTION_SAVE
18946 #define TARGET_OPTION_SAVE aarch64_option_save
18947
18948 #undef TARGET_OPTION_RESTORE
18949 #define TARGET_OPTION_RESTORE aarch64_option_restore
18950
18951 #undef TARGET_OPTION_PRINT
18952 #define TARGET_OPTION_PRINT aarch64_option_print
18953
18954 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18955 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18956
18957 #undef TARGET_SET_CURRENT_FUNCTION
18958 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18959
18960 #undef TARGET_PASS_BY_REFERENCE
18961 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18962
18963 #undef TARGET_PREFERRED_RELOAD_CLASS
18964 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18965
18966 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18967 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18968
18969 #undef TARGET_PROMOTED_TYPE
18970 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18971
18972 #undef TARGET_SECONDARY_RELOAD
18973 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18974
18975 #undef TARGET_SHIFT_TRUNCATION_MASK
18976 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18977
18978 #undef TARGET_SETUP_INCOMING_VARARGS
18979 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18980
18981 #undef TARGET_STRUCT_VALUE_RTX
18982 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
18983
18984 #undef TARGET_REGISTER_MOVE_COST
18985 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18986
18987 #undef TARGET_RETURN_IN_MEMORY
18988 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18989
18990 #undef TARGET_RETURN_IN_MSB
18991 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18992
18993 #undef TARGET_RTX_COSTS
18994 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18995
18996 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18997 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18998
18999 #undef TARGET_SCHED_ISSUE_RATE
19000 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19001
19002 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19003 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19004 aarch64_sched_first_cycle_multipass_dfa_lookahead
19005
19006 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19007 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19008 aarch64_first_cycle_multipass_dfa_lookahead_guard
19009
19010 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19011 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19012 aarch64_get_separate_components
19013
19014 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19015 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19016 aarch64_components_for_bb
19017
19018 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19019 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19020 aarch64_disqualify_components
19021
19022 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19023 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19024 aarch64_emit_prologue_components
19025
19026 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19027 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19028 aarch64_emit_epilogue_components
19029
19030 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19031 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19032 aarch64_set_handled_components
19033
19034 #undef TARGET_TRAMPOLINE_INIT
19035 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19036
19037 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19038 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19039
19040 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19041 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19042
19043 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19044 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19045 aarch64_builtin_support_vector_misalignment
19046
19047 #undef TARGET_ARRAY_MODE
19048 #define TARGET_ARRAY_MODE aarch64_array_mode
19049
19050 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19051 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19052
19053 #undef TARGET_VECTORIZE_ADD_STMT_COST
19054 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19055
19056 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19057 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19058 aarch64_builtin_vectorization_cost
19059
19060 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19061 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19062
19063 #undef TARGET_VECTORIZE_BUILTINS
19064 #define TARGET_VECTORIZE_BUILTINS
19065
19066 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19067 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19068 aarch64_builtin_vectorized_function
19069
19070 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19071 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19072 aarch64_autovectorize_vector_sizes
19073
19074 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19075 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19076 aarch64_atomic_assign_expand_fenv
19077
19078 /* Section anchor support. */
19079
19080 #undef TARGET_MIN_ANCHOR_OFFSET
19081 #define TARGET_MIN_ANCHOR_OFFSET -256
19082
19083 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19084 byte offset; we can do much more for larger data types, but have no way
19085 to determine the size of the access. We assume accesses are aligned. */
19086 #undef TARGET_MAX_ANCHOR_OFFSET
19087 #define TARGET_MAX_ANCHOR_OFFSET 4095
19088
19089 #undef TARGET_VECTOR_ALIGNMENT
19090 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19091
19092 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19093 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19094 aarch64_vectorize_preferred_vector_alignment
19095 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19096 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19097 aarch64_simd_vector_alignment_reachable
19098
19099 /* vec_perm support. */
19100
19101 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19102 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19103 aarch64_vectorize_vec_perm_const
19104
19105 #undef TARGET_VECTORIZE_GET_MASK_MODE
19106 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19107 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19108 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19109 aarch64_empty_mask_is_expensive
19110 #undef TARGET_PREFERRED_ELSE_VALUE
19111 #define TARGET_PREFERRED_ELSE_VALUE \
19112 aarch64_preferred_else_value
19113
19114 #undef TARGET_INIT_LIBFUNCS
19115 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19116
19117 #undef TARGET_FIXED_CONDITION_CODE_REGS
19118 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19119
19120 #undef TARGET_FLAGS_REGNUM
19121 #define TARGET_FLAGS_REGNUM CC_REGNUM
19122
19123 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19124 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19125
19126 #undef TARGET_ASAN_SHADOW_OFFSET
19127 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19128
19129 #undef TARGET_LEGITIMIZE_ADDRESS
19130 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19131
19132 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19133 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19134
19135 #undef TARGET_CAN_USE_DOLOOP_P
19136 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19137
19138 #undef TARGET_SCHED_ADJUST_PRIORITY
19139 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19140
19141 #undef TARGET_SCHED_MACRO_FUSION_P
19142 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19143
19144 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19145 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19146
19147 #undef TARGET_SCHED_FUSION_PRIORITY
19148 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19149
19150 #undef TARGET_UNSPEC_MAY_TRAP_P
19151 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19152
19153 #undef TARGET_USE_PSEUDO_PIC_REG
19154 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19155
19156 #undef TARGET_PRINT_OPERAND
19157 #define TARGET_PRINT_OPERAND aarch64_print_operand
19158
19159 #undef TARGET_PRINT_OPERAND_ADDRESS
19160 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19161
19162 #undef TARGET_OPTAB_SUPPORTED_P
19163 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19164
19165 #undef TARGET_OMIT_STRUCT_RETURN_REG
19166 #define TARGET_OMIT_STRUCT_RETURN_REG true
19167
19168 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19169 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19170 aarch64_dwarf_poly_indeterminate_value
19171
19172 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19173 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19174 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19175
19176 #undef TARGET_HARD_REGNO_NREGS
19177 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19178 #undef TARGET_HARD_REGNO_MODE_OK
19179 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19180
19181 #undef TARGET_MODES_TIEABLE_P
19182 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19183
19184 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19185 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19186 aarch64_hard_regno_call_part_clobbered
19187
19188 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19189 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19190 aarch64_remove_extra_call_preserved_regs
19191
19192 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19193 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19194 aarch64_return_call_with_max_clobbers
19195
19196 #undef TARGET_CONSTANT_ALIGNMENT
19197 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19198
19199 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19200 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19201 aarch64_stack_clash_protection_alloca_probe_range
19202
19203 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19204 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19205
19206 #undef TARGET_CAN_CHANGE_MODE_CLASS
19207 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19208
19209 #undef TARGET_SELECT_EARLY_REMAT_MODES
19210 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19211
19212 #undef TARGET_SPECULATION_SAFE_VALUE
19213 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19214
19215 #undef TARGET_ESTIMATED_POLY_VALUE
19216 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19217
19218 #undef TARGET_ATTRIBUTE_TABLE
19219 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19220
19221 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19222 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19223 aarch64_simd_clone_compute_vecsize_and_simdlen
19224
19225 #undef TARGET_SIMD_CLONE_ADJUST
19226 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19227
19228 #undef TARGET_SIMD_CLONE_USABLE
19229 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19230
19231 #if CHECKING_P
19232 #undef TARGET_RUN_TARGET_SELFTESTS
19233 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19234 #endif /* #if CHECKING_P */
19235
19236 struct gcc_target targetm = TARGET_INITIALIZER;
19237
19238 #include "gt-aarch64.h"