gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
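/* As a rough illustration of how these fields are used: an Advanced SIMD
   constant whose 32-bit elements all equal 0x00ab0000 could be described
   as elt_mode SImode, value 0xab, insn MOV, modifier LSL and shift 16,
   while a vector whose elements form the series 1, 3, 5, ... would use the
   (mode, value, step) constructor with value 1 and step 2.  The encoding
   actually chosen is decided by the immediate-classification code further
   down in this file.  */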
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 uint64_t aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Cortex-A57 costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* X-Gene 1 costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* ThunderX2 T99 (Vulcan) costs for vector insn classes. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params neoversen1_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
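/* These parsers are invoked by the -moverride handling further down in this
   file; for example an option string such as -moverride=sve_width=256
   (illustrative) would be dispatched to aarch64_parse_sve_width_string,
   which then adjusts the sve_width field of the active tune_params.  */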
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const uint64_t flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1176
1177 /* The current tuning set. */
1178 struct tune_params aarch64_tune_params = generic_tunings;
1179
1180 /* Table of machine attributes. */
1181 static const struct attribute_spec aarch64_attribute_table[] =
1182 {
1183 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1184 affects_type_identity, handler, exclude } */
1185 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1186 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1187 };
1188
1189 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1190
1191 /* An ISA extension in the co-processor and main instruction set space. */
1192 struct aarch64_option_extension
1193 {
1194 const char *const name;
1195 const unsigned long flags_on;
1196 const unsigned long flags_off;
1197 };
1198
1199 typedef enum aarch64_cond_code
1200 {
1201 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1202 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1203 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1204 }
1205 aarch64_cc;
1206
1207 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
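/* The XOR with 1 relies on the enumeration above being laid out in inverse
   pairs: for example AARCH64_EQ (0) maps to AARCH64_NE (1) and
   AARCH64_GE (10) maps to AARCH64_LT (11), and vice versa.  */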
1208
1209 struct aarch64_branch_protect_type
1210 {
1211 /* The type's name that the user passes to the branch-protection option
1212 string. */
1213 const char* name;
1214 /* Function to handle the protection type and set global variables.
1215 First argument is the string token corresponding with this type and the
1216 second argument is the next token in the option string.
1217 Return values:
1218 * AARCH64_PARSE_OK: Handling was successful.
1219 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1220 should print an error.
1221 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1222 own error. */
1223 enum aarch64_parse_opt_result (*handler)(char*, char*);
1224 /* A list of types that can follow this type in the option string. */
1225 const aarch64_branch_protect_type* subtypes;
1226 unsigned int num_subtypes;
1227 };
1228
1229 static enum aarch64_parse_opt_result
1230 aarch64_handle_no_branch_protection (char* str, char* rest)
1231 {
1232 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1233 aarch64_enable_bti = 0;
1234 if (rest)
1235 {
1236 error ("unexpected %<%s%> after %<%s%>", rest, str);
1237 return AARCH64_PARSE_INVALID_FEATURE;
1238 }
1239 return AARCH64_PARSE_OK;
1240 }
1241
1242 static enum aarch64_parse_opt_result
1243 aarch64_handle_standard_branch_protection (char* str, char* rest)
1244 {
1245 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1246 aarch64_ra_sign_key = AARCH64_KEY_A;
1247 aarch64_enable_bti = 1;
1248 if (rest)
1249 {
1250 error ("unexpected %<%s%> after %<%s%>", rest, str);
1251 return AARCH64_PARSE_INVALID_FEATURE;
1252 }
1253 return AARCH64_PARSE_OK;
1254 }
1255
1256 static enum aarch64_parse_opt_result
1257 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1258 char* rest ATTRIBUTE_UNUSED)
1259 {
1260 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1261 aarch64_ra_sign_key = AARCH64_KEY_A;
1262 return AARCH64_PARSE_OK;
1263 }
1264
1265 static enum aarch64_parse_opt_result
1266 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1267 char* rest ATTRIBUTE_UNUSED)
1268 {
1269 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1270 return AARCH64_PARSE_OK;
1271 }
1272
1273 static enum aarch64_parse_opt_result
1274 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1275 char* rest ATTRIBUTE_UNUSED)
1276 {
1277 aarch64_ra_sign_key = AARCH64_KEY_B;
1278 return AARCH64_PARSE_OK;
1279 }
1280
1281 static enum aarch64_parse_opt_result
1282 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1283 char* rest ATTRIBUTE_UNUSED)
1284 {
1285 aarch64_enable_bti = 1;
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1290 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1291 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1292 { NULL, NULL, NULL, 0 }
1293 };
1294
1295 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1296 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1297 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1298 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1299 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1300 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1301 { NULL, NULL, NULL, 0 }
1302 };
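/* Taken together, the tables above describe option strings such as
   -mbranch-protection=standard (BTI plus A-key signing of non-leaf
   functions) or -mbranch-protection=pac-ret+leaf+b-key (B-key signing of
   all functions, without BTI), with each '+'-separated token handled by
   the matching entry and its subtypes.  */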
1303
1304 /* The condition codes of the processor, and the inverse function. */
1305 static const char * const aarch64_condition_codes[] =
1306 {
1307 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1308 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1309 };
1310
1311 /* The preferred condition codes for SVE conditions. */
1312 static const char *const aarch64_sve_condition_codes[] =
1313 {
1314 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1315 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1316 };
1317
1318 /* Generate code for conditional branches to targets that are out of range, e.g. in functions over 1 MiB. */
1319 const char *
1320 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1321 const char * branch_format)
1322 {
1323 rtx_code_label * tmp_label = gen_label_rtx ();
1324 char label_buf[256];
1325 char buffer[128];
1326 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1327 CODE_LABEL_NUMBER (tmp_label));
1328 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1329 rtx dest_label = operands[pos_label];
1330 operands[pos_label] = tmp_label;
1331
1332 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1333 output_asm_insn (buffer, operands);
1334
1335 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1336 operands[pos_label] = dest_label;
1337 output_asm_insn (buffer, operands);
1338 return "";
1339 }
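/* For example, when a conditional branch cannot reach its target, the
   caller passes the inverted short-range branch as BRANCH_FORMAT and the
   code above emits, roughly:

       <inverted-branch>   .Ltmp
       b                   <original target>
     .Ltmp:

   where .Ltmp stands for the internally generated label.  */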
1340
1341 void
1342 aarch64_err_no_fpadvsimd (machine_mode mode)
1343 {
1344 if (TARGET_GENERAL_REGS_ONLY)
1345 if (FLOAT_MODE_P (mode))
1346 error ("%qs is incompatible with the use of floating-point types",
1347 "-mgeneral-regs-only");
1348 else
1349 error ("%qs is incompatible with the use of vector types",
1350 "-mgeneral-regs-only");
1351 else
1352 if (FLOAT_MODE_P (mode))
1353 error ("%qs feature modifier is incompatible with the use of"
1354 " floating-point types", "+nofp");
1355 else
1356 error ("%qs feature modifier is incompatible with the use of"
1357 " vector types", "+nofp");
1358 }
1359
1360 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1361 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1362 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1363 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1364 and GENERAL_REGS is lower than the memory cost (in this case the best class
1365 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1366 cost results in bad allocations with many redundant int<->FP moves which
1367 are expensive on various cores.
1368 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1369 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1370 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1371 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1372 The result of this is that it is no longer inefficient to have a higher
1373 memory move cost than the register move cost.
1374 */
1375
1376 static reg_class_t
1377 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1378 reg_class_t best_class)
1379 {
1380 machine_mode mode;
1381
1382 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1383 || !reg_class_subset_p (FP_REGS, allocno_class))
1384 return allocno_class;
1385
1386 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1387 || !reg_class_subset_p (FP_REGS, best_class))
1388 return best_class;
1389
1390 mode = PSEUDO_REGNO_MODE (regno);
1391 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1392 }
1393
1394 static unsigned int
1395 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1396 {
1397 if (GET_MODE_UNIT_SIZE (mode) == 4)
1398 return aarch64_tune_params.min_div_recip_mul_sf;
1399 return aarch64_tune_params.min_div_recip_mul_df;
1400 }
1401
1402 /* Return the reassociation width of treeop OPC with mode MODE. */
1403 static int
1404 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1405 {
1406 if (VECTOR_MODE_P (mode))
1407 return aarch64_tune_params.vec_reassoc_width;
1408 if (INTEGRAL_MODE_P (mode))
1409 return aarch64_tune_params.int_reassoc_width;
1410 /* Avoid reassociating floating point addition so we emit more FMAs. */
1411 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1412 return aarch64_tune_params.fp_reassoc_width;
1413 return 1;
1414 }
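/* Under generic_tunings above, for example, this returns a width of 2 for
   integer modes, 4 for scalar floating-point modes other than addition
   (which is left at 1 so that FMAs can still be formed) and 1 for vector
   modes.  */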
1415
1416 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1417 unsigned
1418 aarch64_dbx_register_number (unsigned regno)
1419 {
1420 if (GP_REGNUM_P (regno))
1421 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1422 else if (regno == SP_REGNUM)
1423 return AARCH64_DWARF_SP;
1424 else if (FP_REGNUM_P (regno))
1425 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1426 else if (PR_REGNUM_P (regno))
1427 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1428 else if (regno == VG_REGNUM)
1429 return AARCH64_DWARF_VG;
1430
1431 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1432 equivalent DWARF register. */
1433 return DWARF_FRAME_REGISTERS;
1434 }
1435
1436 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1437 static bool
1438 aarch64_advsimd_struct_mode_p (machine_mode mode)
1439 {
1440 return (TARGET_SIMD
1441 && (mode == OImode || mode == CImode || mode == XImode));
1442 }
1443
1444 /* Return true if MODE is an SVE predicate mode. */
1445 static bool
1446 aarch64_sve_pred_mode_p (machine_mode mode)
1447 {
1448 return (TARGET_SVE
1449 && (mode == VNx16BImode
1450 || mode == VNx8BImode
1451 || mode == VNx4BImode
1452 || mode == VNx2BImode));
1453 }
1454
1455 /* Three mutually-exclusive flags describing a vector or predicate type. */
1456 const unsigned int VEC_ADVSIMD = 1;
1457 const unsigned int VEC_SVE_DATA = 2;
1458 const unsigned int VEC_SVE_PRED = 4;
1459 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1460 a structure of 2, 3 or 4 vectors. */
1461 const unsigned int VEC_STRUCT = 8;
1462 /* Useful combinations of the above. */
1463 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1464 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1465
1466 /* Return a set of flags describing the vector properties of mode MODE.
1467 Ignore modes that are not supported by the current target. */
1468 static unsigned int
1469 aarch64_classify_vector_mode (machine_mode mode)
1470 {
1471 if (aarch64_advsimd_struct_mode_p (mode))
1472 return VEC_ADVSIMD | VEC_STRUCT;
1473
1474 if (aarch64_sve_pred_mode_p (mode))
1475 return VEC_SVE_PRED;
1476
1477 scalar_mode inner = GET_MODE_INNER (mode);
1478 if (VECTOR_MODE_P (mode)
1479 && (inner == QImode
1480 || inner == HImode
1481 || inner == HFmode
1482 || inner == SImode
1483 || inner == SFmode
1484 || inner == DImode
1485 || inner == DFmode))
1486 {
1487 if (TARGET_SVE)
1488 {
1489 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1490 return VEC_SVE_DATA;
1491 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1492 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1493 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1494 return VEC_SVE_DATA | VEC_STRUCT;
1495 }
1496
1497 /* This includes V1DF but not V1DI (which doesn't exist). */
1498 if (TARGET_SIMD
1499 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1500 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1501 return VEC_ADVSIMD;
1502 }
1503
1504 return 0;
1505 }
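/* For example, V4SImode (a 128-bit Advanced SIMD vector) is classified
   here as VEC_ADVSIMD when TARGET_SIMD is enabled, VNx4SImode (an SVE
   vector of 32-bit elements) as VEC_SVE_DATA when TARGET_SVE is enabled,
   and OImode (a structure of two 128-bit vectors) as
   VEC_ADVSIMD | VEC_STRUCT.  */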
1506
1507 /* Return true if MODE is any of the data vector modes, including
1508 structure modes. */
1509 static bool
1510 aarch64_vector_data_mode_p (machine_mode mode)
1511 {
1512 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1513 }
1514
1515 /* Return true if MODE is an SVE data vector mode; either a single vector
1516 or a structure of vectors. */
1517 static bool
1518 aarch64_sve_data_mode_p (machine_mode mode)
1519 {
1520 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1521 }
1522
1523 /* Implement target hook TARGET_ARRAY_MODE. */
1524 static opt_machine_mode
1525 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1526 {
1527 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1528 && IN_RANGE (nelems, 2, 4))
1529 return mode_for_vector (GET_MODE_INNER (mode),
1530 GET_MODE_NUNITS (mode) * nelems);
1531
1532 return opt_machine_mode ();
1533 }
1534
1535 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1536 static bool
1537 aarch64_array_mode_supported_p (machine_mode mode,
1538 unsigned HOST_WIDE_INT nelems)
1539 {
1540 if (TARGET_SIMD
1541 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1542 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1543 && (nelems >= 2 && nelems <= 4))
1544 return true;
1545
1546 return false;
1547 }
1548
1549 /* Return the SVE predicate mode to use for elements that have
1550 ELEM_NBYTES bytes, if such a mode exists. */
1551
1552 opt_machine_mode
1553 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1554 {
1555 if (TARGET_SVE)
1556 {
1557 if (elem_nbytes == 1)
1558 return VNx16BImode;
1559 if (elem_nbytes == 2)
1560 return VNx8BImode;
1561 if (elem_nbytes == 4)
1562 return VNx4BImode;
1563 if (elem_nbytes == 8)
1564 return VNx2BImode;
1565 }
1566 return opt_machine_mode ();
1567 }
1568
1569 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1570
1571 static opt_machine_mode
1572 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1573 {
1574 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1575 {
1576 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1577 machine_mode pred_mode;
1578 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1579 return pred_mode;
1580 }
1581
1582 return default_get_mask_mode (nunits, nbytes);
1583 }
1584
1585 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1586 prefer to use the first arithmetic operand as the else value if
1587 the else value doesn't matter, since that exactly matches the SVE
1588 destructive merging form. For ternary operations we could either
1589 pick the first operand and use FMAD-like instructions or the last
1590 operand and use FMLA-like instructions; the latter seems more
1591 natural. */
1592
1593 static tree
1594 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1595 {
1596 return nops == 3 ? ops[2] : ops[0];
1597 }
1598
1599 /* Implement TARGET_HARD_REGNO_NREGS. */
1600
1601 static unsigned int
1602 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1603 {
1604 /* ??? Logically we should only need to provide a value when
1605 HARD_REGNO_MODE_OK says that the combination is valid,
1606 but at the moment we need to handle all modes. Just ignore
1607 any runtime parts for registers that can't store them. */
1608 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1609 switch (aarch64_regno_regclass (regno))
1610 {
1611 case FP_REGS:
1612 case FP_LO_REGS:
1613 if (aarch64_sve_data_mode_p (mode))
1614 return exact_div (GET_MODE_SIZE (mode),
1615 BYTES_PER_SVE_VECTOR).to_constant ();
1616 return CEIL (lowest_size, UNITS_PER_VREG);
1617 case PR_REGS:
1618 case PR_LO_REGS:
1619 case PR_HI_REGS:
1620 return 1;
1621 default:
1622 return CEIL (lowest_size, UNITS_PER_WORD);
1623 }
1624 gcc_unreachable ();
1625 }
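/* For example, a TFmode value (16 bytes) occupies a single FP register but
   two general-purpose registers, while any SVE predicate mode occupies
   exactly one predicate register.  */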
1626
1627 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1628
1629 static bool
1630 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1631 {
1632 if (GET_MODE_CLASS (mode) == MODE_CC)
1633 return regno == CC_REGNUM;
1634
1635 if (regno == VG_REGNUM)
1636 /* This must have the same size as _Unwind_Word. */
1637 return mode == DImode;
1638
1639 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1640 if (vec_flags & VEC_SVE_PRED)
1641 return PR_REGNUM_P (regno);
1642
1643 if (PR_REGNUM_P (regno))
1644 return 0;
1645
1646 if (regno == SP_REGNUM)
1647 /* The purpose of comparing with ptr_mode is to support the
1648 global register variable associated with the stack pointer
1649 register via the syntax of asm ("wsp") in ILP32. */
1650 return mode == Pmode || mode == ptr_mode;
1651
1652 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1653 return mode == Pmode;
1654
1655 if (GP_REGNUM_P (regno))
1656 {
1657 if (known_le (GET_MODE_SIZE (mode), 8))
1658 return true;
1659 else if (known_le (GET_MODE_SIZE (mode), 16))
1660 return (regno & 1) == 0;
1661 }
1662 else if (FP_REGNUM_P (regno))
1663 {
1664 if (vec_flags & VEC_STRUCT)
1665 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1666 else
1667 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1668 }
1669
1670 return false;
1671 }
1672
1673 /* Return true if this is a definition of a vectorized simd function. */
1674
1675 static bool
1676 aarch64_simd_decl_p (tree fndecl)
1677 {
1678 tree fntype;
1679
1680 if (fndecl == NULL)
1681 return false;
1682 fntype = TREE_TYPE (fndecl);
1683 if (fntype == NULL)
1684 return false;
1685
1686 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1687 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1688 return true;
1689
1690 return false;
1691 }
1692
1693 /* Return the mode a register save/restore should use. DImode for integer
1694 registers, DFmode for FP registers in non-SIMD functions (they only save
1695 the bottom half of a 128 bit register), or TFmode for FP registers in
1696 SIMD functions. */
1697
1698 static machine_mode
1699 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1700 {
1701 return GP_REGNUM_P (regno)
1702 ? E_DImode
1703 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1704 }
1705
1706 /* Return true if the instruction is a call to a SIMD function, false
1707 if it is not a SIMD function or if we do not know anything about
1708 the function. */
1709
1710 static bool
1711 aarch64_simd_call_p (rtx_insn *insn)
1712 {
1713 rtx symbol;
1714 rtx call;
1715 tree fndecl;
1716
1717 gcc_assert (CALL_P (insn));
1718 call = get_call_rtx_from (insn);
1719 symbol = XEXP (XEXP (call, 0), 0);
1720 if (GET_CODE (symbol) != SYMBOL_REF)
1721 return false;
1722 fndecl = SYMBOL_REF_DECL (symbol);
1723 if (!fndecl)
1724 return false;
1725
1726 return aarch64_simd_decl_p (fndecl);
1727 }
1728
1729 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1730 a function that uses the SIMD ABI, take advantage of the extra
1731 call-preserved registers that the ABI provides. */
1732
1733 void
1734 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1735 HARD_REG_SET *return_set)
1736 {
1737 if (aarch64_simd_call_p (insn))
1738 {
1739 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1740 if (FP_SIMD_SAVED_REGNUM_P (regno))
1741 CLEAR_HARD_REG_BIT (*return_set, regno);
1742 }
1743 }
1744
1745 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1746 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1747 clobbers the top 64 bits when restoring the bottom 64 bits. */
1748
1749 static bool
1750 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1751 machine_mode mode)
1752 {
1753 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1754 return FP_REGNUM_P (regno)
1755 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1756 }
1757
1758 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1759
1760 rtx_insn *
1761 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1762 {
1763 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1764
1765 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1766 return call_1;
1767 else
1768 return call_2;
1769 }
1770
1771 /* Implement REGMODE_NATURAL_SIZE. */
1772 poly_uint64
1773 aarch64_regmode_natural_size (machine_mode mode)
1774 {
1775 /* The natural size for SVE data modes is one SVE data vector,
1776 and similarly for predicates. We can't independently modify
1777 anything smaller than that. */
1778 /* ??? For now, only do this for variable-width SVE registers.
1779 Doing it for constant-sized registers breaks lower-subreg.c. */
1780 /* ??? And once that's fixed, we should probably have similar
1781 code for Advanced SIMD. */
1782 if (!aarch64_sve_vg.is_constant ())
1783 {
1784 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1785 if (vec_flags & VEC_SVE_PRED)
1786 return BYTES_PER_SVE_PRED;
1787 if (vec_flags & VEC_SVE_DATA)
1788 return BYTES_PER_SVE_VECTOR;
1789 }
1790 return UNITS_PER_WORD;
1791 }
1792
1793 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1794 machine_mode
1795 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1796 machine_mode mode)
1797 {
1798 /* The predicate mode determines which bits are significant and
1799 which are "don't care". Decreasing the number of lanes would
1800 lose data while increasing the number of lanes would make bits
1801 unnecessarily significant. */
1802 if (PR_REGNUM_P (regno))
1803 return mode;
1804 if (known_ge (GET_MODE_SIZE (mode), 4))
1805 return mode;
1806 else
1807 return SImode;
1808 }
1809
1810 /* Return true if I's bits are consecutive ones from the MSB. */
1811 bool
1812 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1813 {
1814 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1815 }
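/* Worked example (illustrative): for I = 0xffffffffffffff00 we have
   -I = 0x100 and exact_log2 (0x100) = 8 != -1, so the function returns
   true (the top 56 bits are all ones).  For I = 0xff00ff00ff00ff00,
   -I is not a power of two, so the function returns false.  */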
1816
1817 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1818 that strcpy from constants will be faster. */
1819
1820 static HOST_WIDE_INT
1821 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1822 {
1823 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1824 return MAX (align, BITS_PER_WORD);
1825 return align;
1826 }
1827
1828 /* Return true if calls to DECL should be treated as
1829 long-calls (ie called via a register). */
1830 static bool
1831 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1832 {
1833 return false;
1834 }
1835
1836 /* Return true if calls to symbol-ref SYM should be treated as
1837 long-calls (ie called via a register). */
1838 bool
1839 aarch64_is_long_call_p (rtx sym)
1840 {
1841 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1842 }
1843
1844 /* Return true if calls to symbol-ref SYM should not go through
1845 plt stubs. */
1846
1847 bool
1848 aarch64_is_noplt_call_p (rtx sym)
1849 {
1850 const_tree decl = SYMBOL_REF_DECL (sym);
1851
1852 if (flag_pic
1853 && decl
1854 && (!flag_plt
1855 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1856 && !targetm.binds_local_p (decl))
1857 return true;
1858
1859 return false;
1860 }
1861
1862 /* Return true if the offsets to a zero/sign-extract operation
1863 represent an expression that matches an extend operation. The
1864 operands represent the parameters from
1865
1866 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1867 bool
1868 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1869 rtx extract_imm)
1870 {
1871 HOST_WIDE_INT mult_val, extract_val;
1872
1873 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1874 return false;
1875
1876 mult_val = INTVAL (mult_imm);
1877 extract_val = INTVAL (extract_imm);
1878
1879 if (extract_val > 8
1880 && extract_val < GET_MODE_BITSIZE (mode)
1881 && exact_log2 (extract_val & ~7) > 0
1882 && (extract_val & 7) <= 4
1883 && mult_val == (1 << (extract_val & 7)))
1884 return true;
1885
1886 return false;
1887 }
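/* Worked example (for illustration only): in DImode, MULT_IMM = 4 and
   EXTRACT_IMM = 34 pass all of the checks above: 34 > 8, 34 < 64,
   exact_log2 (34 & ~7) = exact_log2 (32) = 5 > 0, (34 & 7) = 2 <= 4
   and 4 == 1 << 2.  This describes extracting 34 bits of (reg * 4),
   i.e. a 32-bit extend followed by a left shift by 2.  */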
1888
1889 /* Emit an insn that's a simple single-set. Both the operands must be
1890 known to be valid. */
1891 inline static rtx_insn *
1892 emit_set_insn (rtx x, rtx y)
1893 {
1894 return emit_insn (gen_rtx_SET (x, y));
1895 }
1896
1897 /* X and Y are two things to compare using CODE. Emit the compare insn and
1898 return the rtx for register 0 in the proper mode. */
1899 rtx
1900 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1901 {
1902 machine_mode mode = SELECT_CC_MODE (code, x, y);
1903 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1904
1905 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1906 return cc_reg;
1907 }
1908
1909 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1910
1911 static rtx
1912 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1913 machine_mode y_mode)
1914 {
1915 if (y_mode == E_QImode || y_mode == E_HImode)
1916 {
1917 if (CONST_INT_P (y))
1918 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1919 else
1920 {
1921 rtx t, cc_reg;
1922 machine_mode cc_mode;
1923
1924 t = gen_rtx_ZERO_EXTEND (SImode, y);
1925 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1926 cc_mode = CC_SWPmode;
1927 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1928 emit_set_insn (cc_reg, t);
1929 return cc_reg;
1930 }
1931 }
1932
1933 return aarch64_gen_compare_reg (code, x, y);
1934 }
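/* Reader's note (a sketch of the intent, as read from the code above):
   when Y is a QImode or HImode register, the comparison is emitted as
   COMPARE ((zero_extend:SI Y), X) rather than COMPARE (X, Y), with
   CC_SWPmode recording that the operands are in swapped order.  */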
1935
1936 /* Build the SYMBOL_REF for __tls_get_addr. */
1937
1938 static GTY(()) rtx tls_get_addr_libfunc;
1939
1940 rtx
1941 aarch64_tls_get_addr (void)
1942 {
1943 if (!tls_get_addr_libfunc)
1944 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1945 return tls_get_addr_libfunc;
1946 }
1947
1948 /* Return the TLS model to use for ADDR. */
1949
1950 static enum tls_model
1951 tls_symbolic_operand_type (rtx addr)
1952 {
1953 enum tls_model tls_kind = TLS_MODEL_NONE;
1954 if (GET_CODE (addr) == CONST)
1955 {
1956 poly_int64 addend;
1957 rtx sym = strip_offset (addr, &addend);
1958 if (GET_CODE (sym) == SYMBOL_REF)
1959 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1960 }
1961 else if (GET_CODE (addr) == SYMBOL_REF)
1962 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1963
1964 return tls_kind;
1965 }
1966
1967 /* We'll allow lo_sum's in addresses in our legitimate addresses
1968 so that combine would take care of combining addresses where
1969 necessary, but for generation purposes, we'll generate the address
1970 as :
1971 RTL Absolute
1972 tmp = hi (symbol_ref); adrp x1, foo
1973 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1974 nop
1975
1976 PIC TLS
1977 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1978 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1979 bl __tls_get_addr
1980 nop
1981
1982 Load TLS symbol, depending on TLS mechanism and TLS access model.
1983
1984 Global Dynamic - Traditional TLS:
1985 adrp tmp, :tlsgd:imm
1986 add dest, tmp, #:tlsgd_lo12:imm
1987 bl __tls_get_addr
1988
1989 Global Dynamic - TLS Descriptors:
1990 adrp dest, :tlsdesc:imm
1991 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1992 add dest, dest, #:tlsdesc_lo12:imm
1993 blr tmp
1994 mrs tp, tpidr_el0
1995 add dest, dest, tp
1996
1997 Initial Exec:
1998 mrs tp, tpidr_el0
1999 adrp tmp, :gottprel:imm
2000 ldr dest, [tmp, #:gottprel_lo12:imm]
2001 add dest, dest, tp
2002
2003 Local Exec:
2004 mrs tp, tpidr_el0
2005 add t0, tp, #:tprel_hi12:imm, lsl #12
2006 add t0, t0, #:tprel_lo12_nc:imm
2007 */
2008
2009 static void
2010 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2011 enum aarch64_symbol_type type)
2012 {
2013 switch (type)
2014 {
2015 case SYMBOL_SMALL_ABSOLUTE:
2016 {
2017 /* In ILP32, the mode of dest can be either SImode or DImode. */
2018 rtx tmp_reg = dest;
2019 machine_mode mode = GET_MODE (dest);
2020
2021 gcc_assert (mode == Pmode || mode == ptr_mode);
2022
2023 if (can_create_pseudo_p ())
2024 tmp_reg = gen_reg_rtx (mode);
2025
2026 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2027 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2028 return;
2029 }
2030
2031 case SYMBOL_TINY_ABSOLUTE:
2032 emit_insn (gen_rtx_SET (dest, imm));
2033 return;
2034
2035 case SYMBOL_SMALL_GOT_28K:
2036 {
2037 machine_mode mode = GET_MODE (dest);
2038 rtx gp_rtx = pic_offset_table_rtx;
2039 rtx insn;
2040 rtx mem;
2041
2042 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2043 here before RTL expansion. Tree IVOPTs generates RTL patterns to
2044 compute rtx costs, in which case pic_offset_table_rtx is not
2045 initialized. In that case there is no need to generate the first
2046 adrp instruction, as the final cost for a global variable access
2047 is one instruction. */
2048 if (gp_rtx != NULL)
2049 {
2050 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
2051 we use the page base as the GOT base, the first page may be
2052 wasted; in the worst case only 28K of GOT space remains).
2053
2054 The instruction sequence generated for a global variable access
2055 is:
2056
2057 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2058
2059 Only one instruction is needed, but we must initialize
2060 pic_offset_table_rtx properly. We generate the initialization insn
2061 for every global access and let CSE remove the redundant copies.
2062
2063 The final instruction sequence for multiple global variable
2064 accesses will look like:
2065
2066 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2067
2068 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2069 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2070 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2071 ... */
2072
2073 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2074 crtl->uses_pic_offset_table = 1;
2075 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2076
2077 if (mode != GET_MODE (gp_rtx))
2078 gp_rtx = gen_lowpart (mode, gp_rtx);
2079
2080 }
2081
2082 if (mode == ptr_mode)
2083 {
2084 if (mode == DImode)
2085 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2086 else
2087 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2088
2089 mem = XVECEXP (SET_SRC (insn), 0, 0);
2090 }
2091 else
2092 {
2093 gcc_assert (mode == Pmode);
2094
2095 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2096 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2097 }
2098
2099 /* The operand is expected to be a MEM. Whenever the related insn
2100 pattern changes, the code above that computes MEM should be
2101 updated. */
2102 gcc_assert (GET_CODE (mem) == MEM);
2103 MEM_READONLY_P (mem) = 1;
2104 MEM_NOTRAP_P (mem) = 1;
2105 emit_insn (insn);
2106 return;
2107 }
2108
2109 case SYMBOL_SMALL_GOT_4G:
2110 {
2111 /* In ILP32, the mode of dest can be either SImode or DImode,
2112 while the got entry is always of SImode size. The mode of
2113 dest depends on how dest is used: if dest is assigned to a
2114 pointer (e.g. in the memory), it has SImode; it may have
2115 DImode if dest is dereferenced to access the memory.
2116 This is why we have to handle three different ldr_got_small
2117 patterns here (two patterns for ILP32). */
2118
2119 rtx insn;
2120 rtx mem;
2121 rtx tmp_reg = dest;
2122 machine_mode mode = GET_MODE (dest);
2123
2124 if (can_create_pseudo_p ())
2125 tmp_reg = gen_reg_rtx (mode);
2126
2127 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2128 if (mode == ptr_mode)
2129 {
2130 if (mode == DImode)
2131 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2132 else
2133 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2134
2135 mem = XVECEXP (SET_SRC (insn), 0, 0);
2136 }
2137 else
2138 {
2139 gcc_assert (mode == Pmode);
2140
2141 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2142 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2143 }
2144
2145 gcc_assert (GET_CODE (mem) == MEM);
2146 MEM_READONLY_P (mem) = 1;
2147 MEM_NOTRAP_P (mem) = 1;
2148 emit_insn (insn);
2149 return;
2150 }
2151
2152 case SYMBOL_SMALL_TLSGD:
2153 {
2154 rtx_insn *insns;
2155 machine_mode mode = GET_MODE (dest);
2156 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2157
2158 start_sequence ();
2159 if (TARGET_ILP32)
2160 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2161 else
2162 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2163 insns = get_insns ();
2164 end_sequence ();
2165
2166 RTL_CONST_CALL_P (insns) = 1;
2167 emit_libcall_block (insns, dest, result, imm);
2168 return;
2169 }
2170
2171 case SYMBOL_SMALL_TLSDESC:
2172 {
2173 machine_mode mode = GET_MODE (dest);
2174 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2175 rtx tp;
2176
2177 gcc_assert (mode == Pmode || mode == ptr_mode);
2178
2179 /* In ILP32, the got entry is always of SImode size. Unlike
2180 small GOT, the dest is fixed at reg 0. */
2181 if (TARGET_ILP32)
2182 emit_insn (gen_tlsdesc_small_si (imm));
2183 else
2184 emit_insn (gen_tlsdesc_small_di (imm));
2185 tp = aarch64_load_tp (NULL);
2186
2187 if (mode != Pmode)
2188 tp = gen_lowpart (mode, tp);
2189
2190 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2191 if (REG_P (dest))
2192 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2193 return;
2194 }
2195
2196 case SYMBOL_SMALL_TLSIE:
2197 {
2198 /* In ILP32, the mode of dest can be either SImode or DImode,
2199 while the got entry is always of SImode size. The mode of
2200 dest depends on how dest is used: if dest is assigned to a
2201 pointer (e.g. in the memory), it has SImode; it may have
2202 DImode if dest is dereferenced to access the memory.
2203 This is why we have to handle three different tlsie_small
2204 patterns here (two patterns for ILP32). */
2205 machine_mode mode = GET_MODE (dest);
2206 rtx tmp_reg = gen_reg_rtx (mode);
2207 rtx tp = aarch64_load_tp (NULL);
2208
2209 if (mode == ptr_mode)
2210 {
2211 if (mode == DImode)
2212 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2213 else
2214 {
2215 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2216 tp = gen_lowpart (mode, tp);
2217 }
2218 }
2219 else
2220 {
2221 gcc_assert (mode == Pmode);
2222 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2223 }
2224
2225 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2226 if (REG_P (dest))
2227 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2228 return;
2229 }
2230
2231 case SYMBOL_TLSLE12:
2232 case SYMBOL_TLSLE24:
2233 case SYMBOL_TLSLE32:
2234 case SYMBOL_TLSLE48:
2235 {
2236 machine_mode mode = GET_MODE (dest);
2237 rtx tp = aarch64_load_tp (NULL);
2238
2239 if (mode != Pmode)
2240 tp = gen_lowpart (mode, tp);
2241
2242 switch (type)
2243 {
2244 case SYMBOL_TLSLE12:
2245 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2246 (dest, tp, imm));
2247 break;
2248 case SYMBOL_TLSLE24:
2249 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2250 (dest, tp, imm));
2251 break;
2252 case SYMBOL_TLSLE32:
2253 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2254 (dest, imm));
2255 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2256 (dest, dest, tp));
2257 break;
2258 case SYMBOL_TLSLE48:
2259 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2260 (dest, imm));
2261 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2262 (dest, dest, tp));
2263 break;
2264 default:
2265 gcc_unreachable ();
2266 }
2267
2268 if (REG_P (dest))
2269 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2270 return;
2271 }
2272
2273 case SYMBOL_TINY_GOT:
2274 emit_insn (gen_ldr_got_tiny (dest, imm));
2275 return;
2276
2277 case SYMBOL_TINY_TLSIE:
2278 {
2279 machine_mode mode = GET_MODE (dest);
2280 rtx tp = aarch64_load_tp (NULL);
2281
2282 if (mode == ptr_mode)
2283 {
2284 if (mode == DImode)
2285 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2286 else
2287 {
2288 tp = gen_lowpart (mode, tp);
2289 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2290 }
2291 }
2292 else
2293 {
2294 gcc_assert (mode == Pmode);
2295 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2296 }
2297
2298 if (REG_P (dest))
2299 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2300 return;
2301 }
2302
2303 default:
2304 gcc_unreachable ();
2305 }
2306 }
2307
2308 /* Emit a move from SRC to DEST. Assume that the move expanders can
2309 handle all moves if !can_create_pseudo_p (). The distinction is
2310 important because, unlike emit_move_insn, the move expanders know
2311 how to force Pmode objects into the constant pool even when the
2312 constant pool address is not itself legitimate. */
2313 static rtx
2314 aarch64_emit_move (rtx dest, rtx src)
2315 {
2316 return (can_create_pseudo_p ()
2317 ? emit_move_insn (dest, src)
2318 : emit_move_insn_1 (dest, src));
2319 }
2320
2321 /* Apply UNOPTAB to OP and store the result in DEST. */
2322
2323 static void
2324 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2325 {
2326 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2327 if (dest != tmp)
2328 emit_move_insn (dest, tmp);
2329 }
2330
2331 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2332
2333 static void
2334 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2335 {
2336 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2337 OPTAB_DIRECT);
2338 if (dest != tmp)
2339 emit_move_insn (dest, tmp);
2340 }
2341
2342 /* Split a 128-bit move operation into two 64-bit move operations,
2343 taking care to handle partial overlap of register to register
2344 copies. Special cases are needed when moving between GP regs and
2345 FP regs. SRC can be a register, constant or memory; DST a register
2346 or memory. If either operand is memory it must not have any side
2347 effects. */
2348 void
2349 aarch64_split_128bit_move (rtx dst, rtx src)
2350 {
2351 rtx dst_lo, dst_hi;
2352 rtx src_lo, src_hi;
2353
2354 machine_mode mode = GET_MODE (dst);
2355
2356 gcc_assert (mode == TImode || mode == TFmode);
2357 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2358 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2359
2360 if (REG_P (dst) && REG_P (src))
2361 {
2362 int src_regno = REGNO (src);
2363 int dst_regno = REGNO (dst);
2364
2365 /* Handle FP <-> GP regs. */
2366 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2367 {
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart (word_mode, src);
2370
2371 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2372 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2373 return;
2374 }
2375 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2376 {
2377 dst_lo = gen_lowpart (word_mode, dst);
2378 dst_hi = gen_highpart (word_mode, dst);
2379
2380 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2381 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2382 return;
2383 }
2384 }
2385
2386 dst_lo = gen_lowpart (word_mode, dst);
2387 dst_hi = gen_highpart (word_mode, dst);
2388 src_lo = gen_lowpart (word_mode, src);
2389 src_hi = gen_highpart_mode (word_mode, mode, src);
2390
2391 /* At most one pairing may overlap. */
2392 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2393 {
2394 aarch64_emit_move (dst_hi, src_hi);
2395 aarch64_emit_move (dst_lo, src_lo);
2396 }
2397 else
2398 {
2399 aarch64_emit_move (dst_lo, src_lo);
2400 aarch64_emit_move (dst_hi, src_hi);
2401 }
2402 }
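/* Ordering example (illustrative): splitting a TImode copy from
   {x0, x1} into {x1, x2} would clobber the source high part if the low
   halves were moved first, because dst_lo (x1) overlaps src_hi (x1).
   The overlap check above therefore emits the high-part move first:
   x2 = x1, then x1 = x0.  */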
2403
2404 bool
2405 aarch64_split_128bit_move_p (rtx dst, rtx src)
2406 {
2407 return (! REG_P (src)
2408 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2409 }
2410
2411 /* Split a complex SIMD combine. */
2412
2413 void
2414 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2415 {
2416 machine_mode src_mode = GET_MODE (src1);
2417 machine_mode dst_mode = GET_MODE (dst);
2418
2419 gcc_assert (VECTOR_MODE_P (dst_mode));
2420 gcc_assert (register_operand (dst, dst_mode)
2421 && register_operand (src1, src_mode)
2422 && register_operand (src2, src_mode));
2423
2424 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2425 return;
2426 }
2427
2428 /* Split a complex SIMD move. */
2429
2430 void
2431 aarch64_split_simd_move (rtx dst, rtx src)
2432 {
2433 machine_mode src_mode = GET_MODE (src);
2434 machine_mode dst_mode = GET_MODE (dst);
2435
2436 gcc_assert (VECTOR_MODE_P (dst_mode));
2437
2438 if (REG_P (dst) && REG_P (src))
2439 {
2440 gcc_assert (VECTOR_MODE_P (src_mode));
2441 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2442 }
2443 }
2444
2445 bool
2446 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2447 machine_mode ymode, rtx y)
2448 {
2449 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2450 gcc_assert (r != NULL);
2451 return rtx_equal_p (x, r);
2452 }
2453
2454
2455 static rtx
2456 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2457 {
2458 if (can_create_pseudo_p ())
2459 return force_reg (mode, value);
2460 else
2461 {
2462 gcc_assert (x);
2463 aarch64_emit_move (x, value);
2464 return x;
2465 }
2466 }
2467
2468 /* Return an all-true predicate register of mode MODE. */
2469
2470 rtx
2471 aarch64_ptrue_reg (machine_mode mode)
2472 {
2473 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2474 return force_reg (mode, CONSTM1_RTX (mode));
2475 }
2476
2477 /* Return an all-false predicate register of mode MODE. */
2478
2479 rtx
2480 aarch64_pfalse_reg (machine_mode mode)
2481 {
2482 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2483 return force_reg (mode, CONST0_RTX (mode));
2484 }
2485
2486 /* Return true if we can move VALUE into a register using a single
2487 CNT[BHWD] instruction. */
2488
2489 static bool
2490 aarch64_sve_cnt_immediate_p (poly_int64 value)
2491 {
2492 HOST_WIDE_INT factor = value.coeffs[0];
2493 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2494 return (value.coeffs[1] == factor
2495 && IN_RANGE (factor, 2, 16 * 16)
2496 && (factor & 1) == 0
2497 && factor <= 16 * (factor & -factor));
2498 }
2499
2500 /* Likewise for rtx X. */
2501
2502 bool
2503 aarch64_sve_cnt_immediate_p (rtx x)
2504 {
2505 poly_int64 value;
2506 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2507 }
2508
2509 /* Return the asm string for an instruction with a CNT-like vector size
2510 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2511 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2512 first part of the operands template (the part that comes before the
2513 vector size itself). FACTOR is the number of quadwords.
2514 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2515 If it is zero, we can use any element size. */
2516
2517 static char *
2518 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2519 unsigned int factor,
2520 unsigned int nelts_per_vq)
2521 {
2522 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2523
2524 if (nelts_per_vq == 0)
2525 /* There is some overlap in the ranges of the four CNT instructions.
2526 Here we always use the smallest possible element size, so that the
2527 multiplier is 1 wherever possible. */
2528 nelts_per_vq = factor & -factor;
2529 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2530 gcc_assert (IN_RANGE (shift, 1, 4));
2531 char suffix = "dwhb"[shift - 1];
2532
2533 factor >>= shift;
2534 unsigned int written;
2535 if (factor == 1)
2536 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2537 prefix, suffix, operands);
2538 else
2539 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2540 prefix, suffix, operands, factor);
2541 gcc_assert (written < sizeof (buffer));
2542 return buffer;
2543 }
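/* Worked example (illustrative): FACTOR = 8 with NELTS_PER_VQ = 0
   defaults to NELTS_PER_VQ = 8 (halfwords per quadword), giving
   SHIFT = 3, suffix 'h' and a residual factor of 1, so the output is
   "cnth\t<operands>".  FACTOR = 32 defaults to NELTS_PER_VQ = 32 but
   SHIFT is capped at 4, giving suffix 'b' and a residual factor of 2,
   i.e. "cntb\t<operands>, all, mul #2".  */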
2544
2545 /* Return the asm string for an instruction with a CNT-like vector size
2546 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2547 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2548 first part of the operands template (the part that comes before the
2549 vector size itself). X is the value of the vector size operand,
2550 as a polynomial integer rtx. */
2551
2552 char *
2553 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2554 rtx x)
2555 {
2556 poly_int64 value = rtx_to_poly_int64 (x);
2557 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2558 return aarch64_output_sve_cnt_immediate (prefix, operands,
2559 value.coeffs[1], 0);
2560 }
2561
2562 /* Return true if we can add VALUE to a register using a single ADDVL
2563 or ADDPL instruction. */
2564
2565 static bool
2566 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2567 {
2568 HOST_WIDE_INT factor = value.coeffs[0];
2569 if (factor == 0 || value.coeffs[1] != factor)
2570 return false;
2571 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2572 and a value of 16 is one vector width. */
2573 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2574 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2575 }
2576
2577 /* Likewise for rtx X. */
2578
2579 bool
2580 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2581 {
2582 poly_int64 value;
2583 return (poly_int_rtx_p (x, &value)
2584 && aarch64_sve_addvl_addpl_immediate_p (value));
2585 }
2586
2587 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2588 and storing the result in operand 0. */
2589
2590 char *
2591 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2592 {
2593 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2594 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2595 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2596
2597 /* Use INC or DEC if possible. */
2598 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2599 {
2600 if (aarch64_sve_cnt_immediate_p (offset_value))
2601 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2602 offset_value.coeffs[1], 0);
2603 if (aarch64_sve_cnt_immediate_p (-offset_value))
2604 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2605 -offset_value.coeffs[1], 0);
2606 }
2607
2608 int factor = offset_value.coeffs[1];
2609 if ((factor & 15) == 0)
2610 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2611 else
2612 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2613 return buffer;
2614 }
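/* Worked example (illustrative, assuming the INC/DEC shortcut above is
   not taken): an offset with coefficients (48, 48) has FACTOR = 48,
   a multiple of 16, and is printed as "addvl %x0, %x1, #3", while
   coefficients (6, 6) give FACTOR = 6 and "addpl %x0, %x1, #3".  */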
2615
2616 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2617 instruction. If it is, store the number of elements in each vector
2618 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2619 factor in *FACTOR_OUT (if nonnull). */
2620
2621 bool
2622 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2623 unsigned int *nelts_per_vq_out)
2624 {
2625 rtx elt;
2626 poly_int64 value;
2627
2628 if (!const_vec_duplicate_p (x, &elt)
2629 || !poly_int_rtx_p (elt, &value))
2630 return false;
2631
2632 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2633 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2634 /* There's no vector INCB. */
2635 return false;
2636
2637 HOST_WIDE_INT factor = value.coeffs[0];
2638 if (value.coeffs[1] != factor)
2639 return false;
2640
2641 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2642 if ((factor % nelts_per_vq) != 0
2643 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2644 return false;
2645
2646 if (factor_out)
2647 *factor_out = factor;
2648 if (nelts_per_vq_out)
2649 *nelts_per_vq_out = nelts_per_vq;
2650 return true;
2651 }
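/* Worked example (illustrative): for a VNx4SI duplicate of the
   poly_int value (16, 16), NELTS_PER_VQ is 128 / 32 = 4 and FACTOR
   is 16, which is a multiple of 4 and within [4, 64], so the constant
   is accepted; aarch64_output_sve_inc_dec_immediate below prints it
   as "incw" with "mul #4".  */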
2652
2653 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2654 instruction. */
2655
2656 bool
2657 aarch64_sve_inc_dec_immediate_p (rtx x)
2658 {
2659 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2660 }
2661
2662 /* Return the asm template for an SVE vector INC or DEC instruction.
2663 OPERANDS gives the operands before the vector count and X is the
2664 value of the vector count operand itself. */
2665
2666 char *
2667 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2668 {
2669 int factor;
2670 unsigned int nelts_per_vq;
2671 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2672 gcc_unreachable ();
2673 if (factor < 0)
2674 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2675 nelts_per_vq);
2676 else
2677 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2678 nelts_per_vq);
2679 }
2680
2681 static int
2682 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2683 scalar_int_mode mode)
2684 {
2685 int i;
2686 unsigned HOST_WIDE_INT val, val2, mask;
2687 int one_match, zero_match;
2688 int num_insns;
2689
2690 val = INTVAL (imm);
2691
2692 if (aarch64_move_imm (val, mode))
2693 {
2694 if (generate)
2695 emit_insn (gen_rtx_SET (dest, imm));
2696 return 1;
2697 }
2698
2699 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2700 (with XXXX non-zero). In that case check to see if the move can be done in
2701 a smaller mode. */
2702 val2 = val & 0xffffffff;
2703 if (mode == DImode
2704 && aarch64_move_imm (val2, SImode)
2705 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2706 {
2707 if (generate)
2708 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2709
2710 /* Check if we have to emit a second instruction by checking to see
2711 if any of the upper 32 bits of the original DI mode value is set. */
2712 if (val == val2)
2713 return 1;
2714
2715 i = (val >> 48) ? 48 : 32;
2716
2717 if (generate)
2718 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2719 GEN_INT ((val >> i) & 0xffff)));
2720
2721 return 2;
2722 }
2723
2724 if ((val >> 32) == 0 || mode == SImode)
2725 {
2726 if (generate)
2727 {
2728 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2729 if (mode == SImode)
2730 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2731 GEN_INT ((val >> 16) & 0xffff)));
2732 else
2733 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2734 GEN_INT ((val >> 16) & 0xffff)));
2735 }
2736 return 2;
2737 }
2738
2739 /* Remaining cases are all for DImode. */
2740
2741 mask = 0xffff;
2742 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2743 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2744 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2745 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2746
2747 if (zero_match != 2 && one_match != 2)
2748 {
2749 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2750 For a 64-bit bitmask try whether changing 16 bits to all ones or
2751 zeroes creates a valid bitmask. To check any repeated bitmask,
2752 try using 16 bits from the other 32-bit half of val. */
2753
2754 for (i = 0; i < 64; i += 16, mask <<= 16)
2755 {
2756 val2 = val & ~mask;
2757 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2758 break;
2759 val2 = val | mask;
2760 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2761 break;
2762 val2 = val2 & ~mask;
2763 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2764 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2765 break;
2766 }
2767 if (i != 64)
2768 {
2769 if (generate)
2770 {
2771 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2772 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2773 GEN_INT ((val >> i) & 0xffff)));
2774 }
2775 return 2;
2776 }
2777 }
2778
2779 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2780 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2781 otherwise skip zero bits. */
2782
2783 num_insns = 1;
2784 mask = 0xffff;
2785 val2 = one_match > zero_match ? ~val : val;
2786 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2787
2788 if (generate)
2789 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2790 ? (val | ~(mask << i))
2791 : (val & (mask << i)))));
2792 for (i += 16; i < 64; i += 16)
2793 {
2794 if ((val2 & (mask << i)) == 0)
2795 continue;
2796 if (generate)
2797 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2798 GEN_INT ((val >> i) & 0xffff)));
2799 num_insns ++;
2800 }
2801
2802 return num_insns;
2803 }
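/* Worked example (illustrative, not exhaustive): for the DImode value
   0x1234000056780000, the low 32 bits 0x56780000 are a valid SImode
   move immediate and bits [47:32] are zero, so the early path above
   emits two instructions, roughly:

     mov  dest, #0x56780000
     movk dest, #0x1234, lsl #48

   Values with no such structure fall through to the final loop, which
   emits a MOV followed by up to three MOVKs.  */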
2804
2805 /* Return whether imm is a 128-bit immediate which is simple enough to
2806 expand inline. */
2807 bool
2808 aarch64_mov128_immediate (rtx imm)
2809 {
2810 if (GET_CODE (imm) == CONST_INT)
2811 return true;
2812
2813 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2814
2815 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2816 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2817
2818 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2819 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2820 }
2821
2822
2823 /* Return the number of temporary registers that aarch64_add_offset_1
2824 would need to add OFFSET to a register. */
2825
2826 static unsigned int
2827 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2828 {
2829 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2830 }
2831
2832 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2833 a non-polynomial OFFSET. MODE is the mode of the addition.
2834 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2835 be set and CFA adjustments added to the generated instructions.
2836
2837 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2838 temporary if register allocation is already complete. This temporary
2839 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2840 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2841 the immediate again.
2842
2843 Since this function may be used to adjust the stack pointer, we must
2844 ensure that it cannot cause transient stack deallocation (for example
2845 by first incrementing SP and then decrementing when adjusting by a
2846 large immediate). */
2847
2848 static void
2849 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2850 rtx src, HOST_WIDE_INT offset, rtx temp1,
2851 bool frame_related_p, bool emit_move_imm)
2852 {
2853 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2854 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2855
2856 HOST_WIDE_INT moffset = abs_hwi (offset);
2857 rtx_insn *insn;
2858
2859 if (!moffset)
2860 {
2861 if (!rtx_equal_p (dest, src))
2862 {
2863 insn = emit_insn (gen_rtx_SET (dest, src));
2864 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2865 }
2866 return;
2867 }
2868
2869 /* Single instruction adjustment. */
2870 if (aarch64_uimm12_shift (moffset))
2871 {
2872 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2873 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2874 return;
2875 }
2876
2877 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2878 and either:
2879
2880 a) the offset cannot be loaded by a 16-bit move or
2881 b) there is no spare register into which we can move it. */
2882 if (moffset < 0x1000000
2883 && ((!temp1 && !can_create_pseudo_p ())
2884 || !aarch64_move_imm (moffset, mode)))
2885 {
2886 HOST_WIDE_INT low_off = moffset & 0xfff;
2887
2888 low_off = offset < 0 ? -low_off : low_off;
2889 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2890 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2891 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2892 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2893 return;
2894 }
2895
2896 /* Emit a move immediate if required and an addition/subtraction. */
2897 if (emit_move_imm)
2898 {
2899 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2900 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2901 }
2902 insn = emit_insn (offset < 0
2903 ? gen_sub3_insn (dest, src, temp1)
2904 : gen_add3_insn (dest, src, temp1));
2905 if (frame_related_p)
2906 {
2907 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2908 rtx adj = plus_constant (mode, src, offset);
2909 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2910 }
2911 }
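/* Worked example (illustrative): adding 0x123456 to the stack pointer
   with no scratch register available takes the two-addition path
   above, since the offset fits in 24 bits but is not a (possibly
   shifted) 12-bit immediate:

     add sp, sp, #0x456
     add sp, sp, #0x123000

   Both steps move SP in the same direction, so there is no transient
   deallocation.  */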
2912
2913 /* Return the number of temporary registers that aarch64_add_offset
2914 would need to move OFFSET into a register or add OFFSET to a register;
2915 ADD_P is true if we want the latter rather than the former. */
2916
2917 static unsigned int
2918 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2919 {
2920 /* This follows the same structure as aarch64_add_offset. */
2921 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2922 return 0;
2923
2924 unsigned int count = 0;
2925 HOST_WIDE_INT factor = offset.coeffs[1];
2926 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2927 poly_int64 poly_offset (factor, factor);
2928 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2929 /* Need one register for the ADDVL/ADDPL result. */
2930 count += 1;
2931 else if (factor != 0)
2932 {
2933 factor = abs (factor);
2934 if (factor > 16 * (factor & -factor))
2935 /* Need one register for the CNT result and one for the multiplication
2936 factor. If necessary, the second temporary can be reused for the
2937 constant part of the offset. */
2938 return 2;
2939 /* Need one register for the CNT result (which might then
2940 be shifted). */
2941 count += 1;
2942 }
2943 return count + aarch64_add_offset_1_temporaries (constant);
2944 }
2945
2946 /* If X can be represented as a poly_int64, return the number
2947 of temporaries that are required to add it to a register.
2948 Return -1 otherwise. */
2949
2950 int
2951 aarch64_add_offset_temporaries (rtx x)
2952 {
2953 poly_int64 offset;
2954 if (!poly_int_rtx_p (x, &offset))
2955 return -1;
2956 return aarch64_offset_temporaries (true, offset);
2957 }
2958
2959 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2960 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2961 be set and CFA adjustments added to the generated instructions.
2962
2963 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2964 temporary if register allocation is already complete. This temporary
2965 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2966 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2967 false to avoid emitting the immediate again.
2968
2969 TEMP2, if nonnull, is a second temporary register that doesn't
2970 overlap either DEST or REG.
2971
2972 Since this function may be used to adjust the stack pointer, we must
2973 ensure that it cannot cause transient stack deallocation (for example
2974 by first incrementing SP and then decrementing when adjusting by a
2975 large immediate). */
2976
2977 static void
2978 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2979 poly_int64 offset, rtx temp1, rtx temp2,
2980 bool frame_related_p, bool emit_move_imm = true)
2981 {
2982 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2983 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2984 gcc_assert (temp1 == NULL_RTX
2985 || !frame_related_p
2986 || !reg_overlap_mentioned_p (temp1, dest));
2987 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2988
2989 /* Try using ADDVL or ADDPL to add the whole value. */
2990 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2991 {
2992 rtx offset_rtx = gen_int_mode (offset, mode);
2993 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2994 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2995 return;
2996 }
2997
2998 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2999 SVE vector register, over and above the minimum size of 128 bits.
3000 This is equivalent to half the value returned by CNTD with a
3001 vector shape of ALL. */
3002 HOST_WIDE_INT factor = offset.coeffs[1];
3003 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3004
3005 /* Try using ADDVL or ADDPL to add the VG-based part. */
3006 poly_int64 poly_offset (factor, factor);
3007 if (src != const0_rtx
3008 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3009 {
3010 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3011 if (frame_related_p)
3012 {
3013 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3014 RTX_FRAME_RELATED_P (insn) = true;
3015 src = dest;
3016 }
3017 else
3018 {
3019 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3020 src = aarch64_force_temporary (mode, temp1, addr);
3021 temp1 = temp2;
3022 temp2 = NULL_RTX;
3023 }
3024 }
3025 /* Otherwise use a CNT-based sequence. */
3026 else if (factor != 0)
3027 {
3028 /* Use a subtraction if we have a negative factor. */
3029 rtx_code code = PLUS;
3030 if (factor < 0)
3031 {
3032 factor = -factor;
3033 code = MINUS;
3034 }
3035
3036 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3037 into the multiplication. */
3038 rtx val;
3039 int shift = 0;
3040 if (factor & 1)
3041 /* Use a right shift by 1. */
3042 shift = -1;
3043 else
3044 factor /= 2;
3045 HOST_WIDE_INT low_bit = factor & -factor;
3046 if (factor <= 16 * low_bit)
3047 {
3048 if (factor > 16 * 8)
3049 {
3050 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3051 the value with the minimum multiplier and shift it into
3052 position. */
3053 int extra_shift = exact_log2 (low_bit);
3054 shift += extra_shift;
3055 factor >>= extra_shift;
3056 }
3057 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3058 }
3059 else
3060 {
3061 /* Use CNTD, then multiply it by FACTOR. */
3062 val = gen_int_mode (poly_int64 (2, 2), mode);
3063 val = aarch64_force_temporary (mode, temp1, val);
3064
3065 /* Go back to using a negative multiplication factor if we have
3066 no register from which to subtract. */
3067 if (code == MINUS && src == const0_rtx)
3068 {
3069 factor = -factor;
3070 code = PLUS;
3071 }
3072 rtx coeff1 = gen_int_mode (factor, mode);
3073 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3074 val = gen_rtx_MULT (mode, val, coeff1);
3075 }
3076
3077 if (shift > 0)
3078 {
3079 /* Multiply by 1 << SHIFT. */
3080 val = aarch64_force_temporary (mode, temp1, val);
3081 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3082 }
3083 else if (shift == -1)
3084 {
3085 /* Divide by 2. */
3086 val = aarch64_force_temporary (mode, temp1, val);
3087 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3088 }
3089
3090 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3091 if (src != const0_rtx)
3092 {
3093 val = aarch64_force_temporary (mode, temp1, val);
3094 val = gen_rtx_fmt_ee (code, mode, src, val);
3095 }
3096 else if (code == MINUS)
3097 {
3098 val = aarch64_force_temporary (mode, temp1, val);
3099 val = gen_rtx_NEG (mode, val);
3100 }
3101
3102 if (constant == 0 || frame_related_p)
3103 {
3104 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3105 if (frame_related_p)
3106 {
3107 RTX_FRAME_RELATED_P (insn) = true;
3108 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3109 gen_rtx_SET (dest, plus_constant (Pmode, src,
3110 poly_offset)));
3111 }
3112 src = dest;
3113 if (constant == 0)
3114 return;
3115 }
3116 else
3117 {
3118 src = aarch64_force_temporary (mode, temp1, val);
3119 temp1 = temp2;
3120 temp2 = NULL_RTX;
3121 }
3122
3123 emit_move_imm = true;
3124 }
3125
3126 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3127 frame_related_p, emit_move_imm);
3128 }
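/* Worked example (an illustrative sketch; the exact temporaries used
   depend on the caller): for OFFSET equal to one SVE vector length
   plus 8 bytes, the coefficients are (24, 16), so FACTOR = 16 and
   CONSTANT = 8.  The VG-based part is a valid ADDVL immediate and the
   remainder is a simple addition, giving roughly:

     addvl dest, src, #1
     add   dest, dest, #8  */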
3129
3130 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3131 than a poly_int64. */
3132
3133 void
3134 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3135 rtx offset_rtx, rtx temp1, rtx temp2)
3136 {
3137 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3138 temp1, temp2, false);
3139 }
3140
3141 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3142 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3143 if TEMP1 already contains abs (DELTA). */
3144
3145 static inline void
3146 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3147 {
3148 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3149 temp1, temp2, true, emit_move_imm);
3150 }
3151
3152 /* Subtract DELTA from the stack pointer, marking the instructions
3153 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3154 if nonnull. */
3155
3156 static inline void
3157 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3158 bool emit_move_imm = true)
3159 {
3160 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3161 temp1, temp2, frame_related_p, emit_move_imm);
3162 }
3163
3164 /* Set DEST to (vec_series BASE STEP). */
3165
3166 static void
3167 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3168 {
3169 machine_mode mode = GET_MODE (dest);
3170 scalar_mode inner = GET_MODE_INNER (mode);
3171
3172 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3173 if (!aarch64_sve_index_immediate_p (base))
3174 base = force_reg (inner, base);
3175 if (!aarch64_sve_index_immediate_p (step))
3176 step = force_reg (inner, step);
3177
3178 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3179 }
3180
3181 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3182 integer of mode SRC_MODE. Return true on success. */
3183
3184 static bool
3185 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3186 rtx src)
3187 {
3188 /* If the constant is smaller than 128 bits, we can do the move
3189 using a vector of SRC_MODEs. */
3190 if (src_mode != TImode)
3191 {
3192 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3193 GET_MODE_SIZE (src_mode));
3194 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3195 emit_move_insn (gen_lowpart (dup_mode, dest),
3196 gen_const_vec_duplicate (dup_mode, src));
3197 return true;
3198 }
3199
3200 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3201 src = force_const_mem (src_mode, src);
3202 if (!src)
3203 return false;
3204
3205 /* Make sure that the address is legitimate. */
3206 if (!aarch64_sve_ld1r_operand_p (src))
3207 {
3208 rtx addr = force_reg (Pmode, XEXP (src, 0));
3209 src = replace_equiv_address (src, addr);
3210 }
3211
3212 machine_mode mode = GET_MODE (dest);
3213 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3214 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3215 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3216 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3217 emit_insn (gen_rtx_SET (dest, src));
3218 return true;
3219 }
3220
3221 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3222 isn't a simple duplicate or series. */
3223
3224 static void
3225 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3226 {
3227 machine_mode mode = GET_MODE (src);
3228 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3229 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3230 gcc_assert (npatterns > 1);
3231
3232 if (nelts_per_pattern == 1)
3233 {
3234 /* The constant is a repeating sequence of at least two elements,
3235 where the repeating elements occupy no more than 128 bits.
3236 Get an integer representation of the replicated value. */
3237 scalar_int_mode int_mode;
3238 if (BYTES_BIG_ENDIAN)
3239 /* For now, always use LD1RQ to load the value on big-endian
3240 targets, since the handling of smaller integers includes a
3241 subreg that is semantically an element reverse. */
3242 int_mode = TImode;
3243 else
3244 {
3245 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3246 gcc_assert (int_bits <= 128);
3247 int_mode = int_mode_for_size (int_bits, 0).require ();
3248 }
3249 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3250 if (int_value
3251 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3252 return;
3253 }
3254
3255 /* Expand each pattern individually. */
3256 rtx_vector_builder builder;
3257 auto_vec<rtx, 16> vectors (npatterns);
3258 for (unsigned int i = 0; i < npatterns; ++i)
3259 {
3260 builder.new_vector (mode, 1, nelts_per_pattern);
3261 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3262 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3263 vectors.quick_push (force_reg (mode, builder.build ()));
3264 }
3265
3266 /* Use permutes to interleave the separate vectors. */
3267 while (npatterns > 1)
3268 {
3269 npatterns /= 2;
3270 for (unsigned int i = 0; i < npatterns; ++i)
3271 {
3272 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3273 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3274 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3275 vectors[i] = tmp;
3276 }
3277 }
3278 gcc_assert (vectors[0] == dest);
3279 }
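/* Interleaving example (illustrative): with NPATTERNS = 4 and
   per-pattern vectors V0..V3, the loop above first forms
   ZIP1 (V0, V2) and ZIP1 (V1, V3), then zips those two results into
   DEST, so that element J of pattern P ends up in lane 4 * J + P,
   which is the element order of the original constant.  */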
3280
3281 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3282 is a pattern that can be used to set DEST to a replicated scalar
3283 element. */
3284
3285 void
3286 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3287 rtx (*gen_vec_duplicate) (rtx, rtx))
3288 {
3289 machine_mode mode = GET_MODE (dest);
3290
3291 /* Check on what type of symbol it is. */
3292 scalar_int_mode int_mode;
3293 if ((GET_CODE (imm) == SYMBOL_REF
3294 || GET_CODE (imm) == LABEL_REF
3295 || GET_CODE (imm) == CONST
3296 || GET_CODE (imm) == CONST_POLY_INT)
3297 && is_a <scalar_int_mode> (mode, &int_mode))
3298 {
3299 rtx mem;
3300 poly_int64 offset;
3301 HOST_WIDE_INT const_offset;
3302 enum aarch64_symbol_type sty;
3303
3304 /* If we have (const (plus symbol offset)), separate out the offset
3305 before we start classifying the symbol. */
3306 rtx base = strip_offset (imm, &offset);
3307
3308 /* We must always add an offset involving VL separately, rather than
3309 folding it into the relocation. */
3310 if (!offset.is_constant (&const_offset))
3311 {
3312 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3313 emit_insn (gen_rtx_SET (dest, imm));
3314 else
3315 {
3316 /* Do arithmetic on 32-bit values if the result is smaller
3317 than that. */
3318 if (partial_subreg_p (int_mode, SImode))
3319 {
3320 /* It is invalid to do symbol calculations in modes
3321 narrower than SImode. */
3322 gcc_assert (base == const0_rtx);
3323 dest = gen_lowpart (SImode, dest);
3324 int_mode = SImode;
3325 }
3326 if (base != const0_rtx)
3327 {
3328 base = aarch64_force_temporary (int_mode, dest, base);
3329 aarch64_add_offset (int_mode, dest, base, offset,
3330 NULL_RTX, NULL_RTX, false);
3331 }
3332 else
3333 aarch64_add_offset (int_mode, dest, base, offset,
3334 dest, NULL_RTX, false);
3335 }
3336 return;
3337 }
3338
3339 sty = aarch64_classify_symbol (base, const_offset);
3340 switch (sty)
3341 {
3342 case SYMBOL_FORCE_TO_MEM:
3343 if (const_offset != 0
3344 && targetm.cannot_force_const_mem (int_mode, imm))
3345 {
3346 gcc_assert (can_create_pseudo_p ());
3347 base = aarch64_force_temporary (int_mode, dest, base);
3348 aarch64_add_offset (int_mode, dest, base, const_offset,
3349 NULL_RTX, NULL_RTX, false);
3350 return;
3351 }
3352
3353 mem = force_const_mem (ptr_mode, imm);
3354 gcc_assert (mem);
3355
3356 /* If we aren't generating PC relative literals, then
3357 we need to expand the literal pool access carefully.
3358 This is something that needs to be done in a number
3359 of places, so could well live as a separate function. */
3360 if (!aarch64_pcrelative_literal_loads)
3361 {
3362 gcc_assert (can_create_pseudo_p ());
3363 base = gen_reg_rtx (ptr_mode);
3364 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3365 if (ptr_mode != Pmode)
3366 base = convert_memory_address (Pmode, base);
3367 mem = gen_rtx_MEM (ptr_mode, base);
3368 }
3369
3370 if (int_mode != ptr_mode)
3371 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3372
3373 emit_insn (gen_rtx_SET (dest, mem));
3374
3375 return;
3376
3377 case SYMBOL_SMALL_TLSGD:
3378 case SYMBOL_SMALL_TLSDESC:
3379 case SYMBOL_SMALL_TLSIE:
3380 case SYMBOL_SMALL_GOT_28K:
3381 case SYMBOL_SMALL_GOT_4G:
3382 case SYMBOL_TINY_GOT:
3383 case SYMBOL_TINY_TLSIE:
3384 if (const_offset != 0)
3385 {
3386 gcc_assert(can_create_pseudo_p ());
3387 base = aarch64_force_temporary (int_mode, dest, base);
3388 aarch64_add_offset (int_mode, dest, base, const_offset,
3389 NULL_RTX, NULL_RTX, false);
3390 return;
3391 }
3392 /* FALLTHRU */
3393
3394 case SYMBOL_SMALL_ABSOLUTE:
3395 case SYMBOL_TINY_ABSOLUTE:
3396 case SYMBOL_TLSLE12:
3397 case SYMBOL_TLSLE24:
3398 case SYMBOL_TLSLE32:
3399 case SYMBOL_TLSLE48:
3400 aarch64_load_symref_appropriately (dest, imm, sty);
3401 return;
3402
3403 default:
3404 gcc_unreachable ();
3405 }
3406 }
3407
3408 if (!CONST_INT_P (imm))
3409 {
3410 rtx base, step, value;
3411 if (GET_CODE (imm) == HIGH
3412 || aarch64_simd_valid_immediate (imm, NULL))
3413 emit_insn (gen_rtx_SET (dest, imm));
3414 else if (const_vec_series_p (imm, &base, &step))
3415 aarch64_expand_vec_series (dest, base, step);
3416 else if (const_vec_duplicate_p (imm, &value))
3417 {
3418 /* If the constant is out of range of an SVE vector move,
3419 load it from memory if we can, otherwise move it into
3420 a register and use a DUP. */
3421 scalar_mode inner_mode = GET_MODE_INNER (mode);
3422 rtx op = force_const_mem (inner_mode, value);
3423 if (!op)
3424 op = force_reg (inner_mode, value);
3425 else if (!aarch64_sve_ld1r_operand_p (op))
3426 {
3427 rtx addr = force_reg (Pmode, XEXP (op, 0));
3428 op = replace_equiv_address (op, addr);
3429 }
3430 emit_insn (gen_vec_duplicate (dest, op));
3431 }
3432 else if (GET_CODE (imm) == CONST_VECTOR
3433 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3434 aarch64_expand_sve_const_vector (dest, imm);
3435 else
3436 {
3437 rtx mem = force_const_mem (mode, imm);
3438 gcc_assert (mem);
3439 emit_move_insn (dest, mem);
3440 }
3441
3442 return;
3443 }
3444
3445 aarch64_internal_mov_immediate (dest, imm, true,
3446 as_a <scalar_int_mode> (mode));
3447 }
3448
3449 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3450 that is known to contain PTRUE. */
3451
3452 void
3453 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3454 {
3455 expand_operand ops[3];
3456 machine_mode mode = GET_MODE (dest);
3457 create_output_operand (&ops[0], dest, mode);
3458 create_input_operand (&ops[1], pred, GET_MODE(pred));
3459 create_input_operand (&ops[2], src, mode);
3460 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3461 }
3462
3463 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3464 operand is in memory. In this case we need to use the predicated LD1
3465 and ST1 instead of LDR and STR, both for correctness on big-endian
3466 targets and because LD1 and ST1 support a wider range of addressing modes.
3467 PRED_MODE is the mode of the predicate.
3468
3469 See the comment at the head of aarch64-sve.md for details about the
3470 big-endian handling. */
3471
3472 void
3473 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3474 {
3475 machine_mode mode = GET_MODE (dest);
3476 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3477 if (!register_operand (src, mode)
3478 && !register_operand (dest, mode))
3479 {
3480 rtx tmp = gen_reg_rtx (mode);
3481 if (MEM_P (src))
3482 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3483 else
3484 emit_move_insn (tmp, src);
3485 src = tmp;
3486 }
3487 aarch64_emit_sve_pred_move (dest, ptrue, src);
3488 }
3489
3490 /* Called only on big-endian targets. See whether an SVE vector move
3491 from SRC to DEST is effectively a REV[BHW] instruction, because at
3492 least one operand is a subreg of an SVE vector that has wider or
3493 narrower elements. Return true and emit the instruction if so.
3494
3495 For example:
3496
3497 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3498
3499 represents a VIEW_CONVERT between the following vectors, viewed
3500 in memory order:
3501
3502 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3503 R1: { [0], [1], [2], [3], ... }
3504
3505 The high part of lane X in R2 should therefore correspond to lane X*2
3506 of R1, but the register representations are:
3507
3508 msb lsb
3509 R2: ...... [1].high [1].low [0].high [0].low
3510 R1: ...... [3] [2] [1] [0]
3511
3512 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3513 We therefore need a reverse operation to swap the high and low values
3514 around.
3515
3516 This is purely an optimization. Without it we would spill the
3517 subreg operand to the stack in one mode and reload it in the
3518 other mode, which has the same effect as the REV. */
3519
3520 bool
3521 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3522 {
3523 gcc_assert (BYTES_BIG_ENDIAN);
3524 if (GET_CODE (dest) == SUBREG)
3525 dest = SUBREG_REG (dest);
3526 if (GET_CODE (src) == SUBREG)
3527 src = SUBREG_REG (src);
3528
3529 /* The optimization handles two single SVE REGs with different element
3530 sizes. */
3531 if (!REG_P (dest)
3532 || !REG_P (src)
3533 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3534 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3535 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3536 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3537 return false;
3538
3539 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3540 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
3541 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3542 UNSPEC_REV_SUBREG);
3543 emit_insn (gen_rtx_SET (dest, unspec));
3544 return true;
3545 }
3546
3547 /* Return a copy of X with mode MODE, without changing its other
3548 attributes. Unlike gen_lowpart, this doesn't care whether the
3549 mode change is valid. */
3550
3551 static rtx
3552 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3553 {
3554 if (GET_MODE (x) == mode)
3555 return x;
3556
3557 x = shallow_copy_rtx (x);
3558 set_mode_and_regno (x, mode, REGNO (x));
3559 return x;
3560 }
3561
3562 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3563 operands. */
3564
3565 void
3566 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3567 {
3568 /* Decide which REV operation we need. The mode with narrower elements
3569 determines the mode of the operands and the mode with the wider
3570 elements determines the reverse width. */
3571 machine_mode mode_with_wider_elts = GET_MODE (dest);
3572 machine_mode mode_with_narrower_elts = GET_MODE (src);
3573 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3574 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3575 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3576
3577 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3578 unsigned int unspec;
3579 if (wider_bytes == 8)
3580 unspec = UNSPEC_REV64;
3581 else if (wider_bytes == 4)
3582 unspec = UNSPEC_REV32;
3583 else if (wider_bytes == 2)
3584 unspec = UNSPEC_REV16;
3585 else
3586 gcc_unreachable ();
3587 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3588
3589 /* Emit:
3590
3591 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3592 UNSPEC_MERGE_PTRUE))
3593
3594 with the appropriate modes. */
3595 ptrue = gen_lowpart (pred_mode, ptrue);
3596 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3597 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3598 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3599 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3600 UNSPEC_MERGE_PTRUE);
3601 emit_insn (gen_rtx_SET (dest, src));
3602 }
3603
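/* As a concrete illustration of the splitter above (a sketch, assuming
   big-endian and the mode names used earlier in this file): for

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   the narrower-element mode is VNx16QI and the wider-element mode is
   VNx8HI, so wider_bytes == 2 selects UNSPEC_REV16 and PRED_MODE becomes
   the predicate mode for 2-byte elements.  Both operands are recast to
   VNx16QI and the emitted insn has the shape

     (set (reg:VNx16QI R1)
          (unspec:VNx16QI
            [(reg ptrue)
             (unspec:VNx16QI [(reg:VNx16QI R2)] UNSPEC_REV16)]
            UNSPEC_MERGE_PTRUE))

   i.e. a predicated byte reverse within each halfword.  */
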
3604 static bool
3605 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3606 tree exp ATTRIBUTE_UNUSED)
3607 {
3608 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3609 return false;
3610
3611 return true;
3612 }
3613
3614 /* Implement TARGET_PASS_BY_REFERENCE. */
3615
3616 static bool
3617 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3618 machine_mode mode,
3619 const_tree type,
3620 bool named ATTRIBUTE_UNUSED)
3621 {
3622 HOST_WIDE_INT size;
3623 machine_mode dummymode;
3624 int nregs;
3625
3626 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3627 if (mode == BLKmode && type)
3628 size = int_size_in_bytes (type);
3629 else
3630 /* No frontends can create types with variable-sized modes, so we
3631 shouldn't be asked to pass or return them. */
3632 size = GET_MODE_SIZE (mode).to_constant ();
3633
3634 /* Aggregates are passed by reference based on their size. */
3635 if (type && AGGREGATE_TYPE_P (type))
3636 {
3637 size = int_size_in_bytes (type);
3638 }
3639
3640 /* Variable sized arguments are always passed by reference. */
3641 if (size < 0)
3642 return true;
3643
3644 /* Can this be a candidate to be passed in fp/simd register(s)? */
3645 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3646 &dummymode, &nregs,
3647 NULL))
3648 return false;
3649
3650 /* Arguments which are variable sized or larger than 2 registers are
3651 passed by reference unless they are a homogeneous floating-point
3652 aggregate. */
3653 return size > 2 * UNITS_PER_WORD;
3654 }
3655
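/* With UNITS_PER_WORD == 8 the size test above amounts to a 16-byte
   cutoff: a plain 24-byte structure, for example, is passed by reference,
   whereas a homogeneous floating-point aggregate such as (illustrative)

     struct hfa { double a, b, c, d; };

   is 32 bytes but is caught by the call-or-return-candidate check first
   and is therefore still passed in SIMD/FP registers, one per member.  */
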
3656 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3657 static bool
3658 aarch64_return_in_msb (const_tree valtype)
3659 {
3660 machine_mode dummy_mode;
3661 int dummy_int;
3662
3663 /* Never happens in little-endian mode. */
3664 if (!BYTES_BIG_ENDIAN)
3665 return false;
3666
3667 /* Only composite types smaller than or equal to 16 bytes can
3668 be potentially returned in registers. */
3669 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3670 || int_size_in_bytes (valtype) <= 0
3671 || int_size_in_bytes (valtype) > 16)
3672 return false;
3673
3674 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3675 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3676 is always passed/returned in the least significant bits of fp/simd
3677 register(s). */
3678 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3679 &dummy_mode, &dummy_int, NULL))
3680 return false;
3681
3682 return true;
3683 }
3684
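/* In other words, on big-endian targets a small composite return value
   that is not an HFA/HVA -- say a 3-byte structure -- is placed so that
   its memory image occupies the most significant bytes of x0;
   aarch64_function_value below widens the mode to a whole number of
   words for exactly this case.  */
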
3685 /* Implement TARGET_FUNCTION_VALUE.
3686 Define how to find the value returned by a function. */
3687
3688 static rtx
3689 aarch64_function_value (const_tree type, const_tree func,
3690 bool outgoing ATTRIBUTE_UNUSED)
3691 {
3692 machine_mode mode;
3693 int unsignedp;
3694 int count;
3695 machine_mode ag_mode;
3696
3697 mode = TYPE_MODE (type);
3698 if (INTEGRAL_TYPE_P (type))
3699 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3700
3701 if (aarch64_return_in_msb (type))
3702 {
3703 HOST_WIDE_INT size = int_size_in_bytes (type);
3704
3705 if (size % UNITS_PER_WORD != 0)
3706 {
3707 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3708 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3709 }
3710 }
3711
3712 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3713 &ag_mode, &count, NULL))
3714 {
3715 if (!aarch64_composite_type_p (type, mode))
3716 {
3717 gcc_assert (count == 1 && mode == ag_mode);
3718 return gen_rtx_REG (mode, V0_REGNUM);
3719 }
3720 else
3721 {
3722 int i;
3723 rtx par;
3724
3725 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3726 for (i = 0; i < count; i++)
3727 {
3728 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3729 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3730 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3731 XVECEXP (par, 0, i) = tmp;
3732 }
3733 return par;
3734 }
3735 }
3736 else
3737 return gen_rtx_REG (mode, R0_REGNUM);
3738 }
3739
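/* For example, a homogeneous floating-point aggregate of two floats
   (struct { float a, b; }, illustrative) takes the composite branch above
   with AG_MODE == SFmode and COUNT == 2, producing a PARALLEL of
   (reg:SF v0) at offset 0 and (reg:SF v1) at offset 4, i.e. the value
   comes back in s0 and s1.  A 128-bit integer instead falls through to
   the final return and comes back in x0/x1 (a TImode REG at R0).  */
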
3740 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3741 Return true if REGNO is the number of a hard register in which the values
3742 of called function may come back. */
3743
3744 static bool
3745 aarch64_function_value_regno_p (const unsigned int regno)
3746 {
3747 /* A maximum of 16 bytes can be returned in the general registers. Examples
3748 of 16-byte return values are: 128-bit integers and 16-byte small
3749 structures (excluding homogeneous floating-point aggregates). */
3750 if (regno == R0_REGNUM || regno == R1_REGNUM)
3751 return true;
3752
3753 /* Up to four fp/simd registers can return a function value, e.g. a
3754 homogeneous floating-point aggregate having four members. */
3755 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3756 return TARGET_FLOAT;
3757
3758 return false;
3759 }
3760
3761 /* Implement TARGET_RETURN_IN_MEMORY.
3762
3763 If the type T of the result of a function is such that
3764 void func (T arg)
3765 would require that arg be passed as a value in a register (or set of
3766 registers) according to the parameter passing rules, then the result
3767 is returned in the same registers as would be used for such an
3768 argument. */
3769
3770 static bool
3771 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3772 {
3773 HOST_WIDE_INT size;
3774 machine_mode ag_mode;
3775 int count;
3776
3777 if (!AGGREGATE_TYPE_P (type)
3778 && TREE_CODE (type) != COMPLEX_TYPE
3779 && TREE_CODE (type) != VECTOR_TYPE)
3780 /* Simple scalar types are always returned in registers. */
3781 return false;
3782
3783 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3784 type,
3785 &ag_mode,
3786 &count,
3787 NULL))
3788 return false;
3789
3790 /* Types larger than 2 registers are returned in memory. */
3791 size = int_size_in_bytes (type);
3792 return (size < 0 || size > 2 * UNITS_PER_WORD);
3793 }
3794
3795 static bool
3796 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3797 const_tree type, int *nregs)
3798 {
3799 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3800 return aarch64_vfp_is_call_or_return_candidate (mode,
3801 type,
3802 &pcum->aapcs_vfp_rmode,
3803 nregs,
3804 NULL);
3805 }
3806
3807 /* Given MODE and TYPE of a function argument, return the alignment in
3808 bits. The idea is to suppress any stronger alignment requested by
3809 the user and opt for the natural alignment (specified in AAPCS64 \S
3810 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3811 calculated in versions of GCC prior to GCC-9. This is a helper
3812 function for local use only. */
3813
3814 static unsigned int
3815 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3816 bool *abi_break)
3817 {
3818 *abi_break = false;
3819 if (!type)
3820 return GET_MODE_ALIGNMENT (mode);
3821
3822 if (integer_zerop (TYPE_SIZE (type)))
3823 return 0;
3824
3825 gcc_assert (TYPE_MODE (type) == mode);
3826
3827 if (!AGGREGATE_TYPE_P (type))
3828 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3829
3830 if (TREE_CODE (type) == ARRAY_TYPE)
3831 return TYPE_ALIGN (TREE_TYPE (type));
3832
3833 unsigned int alignment = 0;
3834 unsigned int bitfield_alignment = 0;
3835 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3836 if (TREE_CODE (field) == FIELD_DECL)
3837 {
3838 alignment = std::max (alignment, DECL_ALIGN (field));
3839 if (DECL_BIT_FIELD_TYPE (field))
3840 bitfield_alignment
3841 = std::max (bitfield_alignment,
3842 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3843 }
3844
3845 if (bitfield_alignment > alignment)
3846 {
3847 *abi_break = true;
3848 return bitfield_alignment;
3849 }
3850
3851 return alignment;
3852 }
3853
3854 /* Layout a function argument according to the AAPCS64 rules. The rule
3855 numbers refer to the rule numbers in the AAPCS64. */
3856
3857 static void
3858 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3859 const_tree type,
3860 bool named ATTRIBUTE_UNUSED)
3861 {
3862 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3863 int ncrn, nvrn, nregs;
3864 bool allocate_ncrn, allocate_nvrn;
3865 HOST_WIDE_INT size;
3866 bool abi_break;
3867
3868 /* We need to do this once per argument. */
3869 if (pcum->aapcs_arg_processed)
3870 return;
3871
3872 pcum->aapcs_arg_processed = true;
3873
3874 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3875 if (type)
3876 size = int_size_in_bytes (type);
3877 else
3878 /* No frontends can create types with variable-sized modes, so we
3879 shouldn't be asked to pass or return them. */
3880 size = GET_MODE_SIZE (mode).to_constant ();
3881 size = ROUND_UP (size, UNITS_PER_WORD);
3882
3883 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3884 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3885 mode,
3886 type,
3887 &nregs);
3888
3889 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3890 The following code thus handles passing by SIMD/FP registers first. */
3891
3892 nvrn = pcum->aapcs_nvrn;
3893
3894 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3895 and homogeneous short-vector aggregates (HVA). */
3896 if (allocate_nvrn)
3897 {
3898 if (!TARGET_FLOAT)
3899 aarch64_err_no_fpadvsimd (mode);
3900
3901 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3902 {
3903 pcum->aapcs_nextnvrn = nvrn + nregs;
3904 if (!aarch64_composite_type_p (type, mode))
3905 {
3906 gcc_assert (nregs == 1);
3907 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3908 }
3909 else
3910 {
3911 rtx par;
3912 int i;
3913 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3914 for (i = 0; i < nregs; i++)
3915 {
3916 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3917 V0_REGNUM + nvrn + i);
3918 rtx offset = gen_int_mode
3919 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3920 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3921 XVECEXP (par, 0, i) = tmp;
3922 }
3923 pcum->aapcs_reg = par;
3924 }
3925 return;
3926 }
3927 else
3928 {
3929 /* C.3 NSRN is set to 8. */
3930 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3931 goto on_stack;
3932 }
3933 }
3934
3935 ncrn = pcum->aapcs_ncrn;
3936 nregs = size / UNITS_PER_WORD;
3937
3938 /* C6 - C9, though the sign and zero extension semantics are
3939 handled elsewhere. This is the case where the argument fits
3940 entirely in general registers. */
3941 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3942 {
3943 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3944
3945 /* C.8 if the argument has an alignment of 16 then the NGRN is
3946 rounded up to the next even number. */
3947 if (nregs == 2
3948 && ncrn % 2
3949 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3950 comparison is there because for > 16 * BITS_PER_UNIT
3951 alignment nregs should be > 2 and therefore it should be
3952 passed by reference rather than value. */
3953 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3954 == 16 * BITS_PER_UNIT))
3955 {
3956 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3957 inform (input_location, "parameter passing for argument of type "
3958 "%qT changed in GCC 9.1", type);
3959 ++ncrn;
3960 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3961 }
3962
3963 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3964 A reg is still generated for it, but the caller should be smart
3965 enough not to use it. */
3966 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3967 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3968 else
3969 {
3970 rtx par;
3971 int i;
3972
3973 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3974 for (i = 0; i < nregs; i++)
3975 {
3976 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3977 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3978 GEN_INT (i * UNITS_PER_WORD));
3979 XVECEXP (par, 0, i) = tmp;
3980 }
3981 pcum->aapcs_reg = par;
3982 }
3983
3984 pcum->aapcs_nextncrn = ncrn + nregs;
3985 return;
3986 }
3987
3988 /* C.11 */
3989 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3990
3991 /* The argument is passed on stack; record the needed number of words for
3992 this argument and align the total size if necessary. */
3993 on_stack:
3994 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3995
3996 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3997 == 16 * BITS_PER_UNIT)
3998 {
3999 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4000 if (pcum->aapcs_stack_size != new_size)
4001 {
4002 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4003 inform (input_location, "parameter passing for argument of type "
4004 "%qT changed in GCC 9.1", type);
4005 pcum->aapcs_stack_size = new_size;
4006 }
4007 }
4008 return;
4009 }
4010
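/* A worked example of rule C.8 above (illustrative): for

     void f (int x, __int128 y);

   X is allocated w0, leaving NGRN == 1, but Y requires 16-byte alignment,
   so NGRN is rounded up to 2 and Y is passed in x2/x3 rather than x1/x2.
   The same alignment test in the on_stack path rounds the stack argument
   area up to a 16-byte boundary when such an argument spills to the
   stack.  */
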
4011 /* Implement TARGET_FUNCTION_ARG. */
4012
4013 static rtx
4014 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4015 const_tree type, bool named)
4016 {
4017 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4018 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4019
4020 if (mode == VOIDmode)
4021 return NULL_RTX;
4022
4023 aarch64_layout_arg (pcum_v, mode, type, named);
4024 return pcum->aapcs_reg;
4025 }
4026
4027 void
4028 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4029 const_tree fntype ATTRIBUTE_UNUSED,
4030 rtx libname ATTRIBUTE_UNUSED,
4031 const_tree fndecl ATTRIBUTE_UNUSED,
4032 unsigned n_named ATTRIBUTE_UNUSED)
4033 {
4034 pcum->aapcs_ncrn = 0;
4035 pcum->aapcs_nvrn = 0;
4036 pcum->aapcs_nextncrn = 0;
4037 pcum->aapcs_nextnvrn = 0;
4038 pcum->pcs_variant = ARM_PCS_AAPCS64;
4039 pcum->aapcs_reg = NULL_RTX;
4040 pcum->aapcs_arg_processed = false;
4041 pcum->aapcs_stack_words = 0;
4042 pcum->aapcs_stack_size = 0;
4043
4044 if (!TARGET_FLOAT
4045 && fndecl && TREE_PUBLIC (fndecl)
4046 && fntype && fntype != error_mark_node)
4047 {
4048 const_tree type = TREE_TYPE (fntype);
4049 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4050 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4051 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4052 &mode, &nregs, NULL))
4053 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4054 }
4055 return;
4056 }
4057
4058 static void
4059 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4060 machine_mode mode,
4061 const_tree type,
4062 bool named)
4063 {
4064 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4065 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4066 {
4067 aarch64_layout_arg (pcum_v, mode, type, named);
4068 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4069 != (pcum->aapcs_stack_words != 0));
4070 pcum->aapcs_arg_processed = false;
4071 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4072 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4073 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4074 pcum->aapcs_stack_words = 0;
4075 pcum->aapcs_reg = NULL_RTX;
4076 }
4077 }
4078
4079 bool
4080 aarch64_function_arg_regno_p (unsigned regno)
4081 {
4082 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4083 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4084 }
4085
4086 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4087 PARM_BOUNDARY bits of alignment, but will be given anything up
4088 to STACK_BOUNDARY bits if the type requires it. This makes sure
4089 that both before and after the layout of each argument, the Next
4090 Stacked Argument Address (NSAA) will have a minimum alignment of
4091 8 bytes. */
4092
4093 static unsigned int
4094 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4095 {
4096 bool abi_break;
4097 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4098 &abi_break);
4099 if (abi_break && warn_psabi)
4100 inform (input_location, "parameter passing for argument of type "
4101 "%qT changed in GCC 9.1", type);
4102
4103 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4104 }
4105
4106 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4107
4108 static fixed_size_mode
4109 aarch64_get_reg_raw_mode (int regno)
4110 {
4111 if (TARGET_SVE && FP_REGNUM_P (regno))
4112 /* Don't use the SVE part of the register for __builtin_apply and
4113 __builtin_return. The SVE registers aren't used by the normal PCS,
4114 so using them there would be a waste of time. The PCS extensions
4115 for SVE types are fundamentally incompatible with the
4116 __builtin_return/__builtin_apply interface. */
4117 return as_a <fixed_size_mode> (V16QImode);
4118 return default_get_reg_raw_mode (regno);
4119 }
4120
4121 /* Implement TARGET_FUNCTION_ARG_PADDING.
4122
4123 Small aggregate types are placed in the lowest memory address.
4124
4125 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4126
4127 static pad_direction
4128 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4129 {
4130 /* On little-endian targets, the least significant byte of every stack
4131 argument is passed at the lowest byte address of the stack slot. */
4132 if (!BYTES_BIG_ENDIAN)
4133 return PAD_UPWARD;
4134
4135 /* Otherwise, integral, floating-point and pointer types are padded downward:
4136 the least significant byte of a stack argument is passed at the highest
4137 byte address of the stack slot. */
4138 if (type
4139 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4140 || POINTER_TYPE_P (type))
4141 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4142 return PAD_DOWNWARD;
4143
4144 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4145 return PAD_UPWARD;
4146 }
4147
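/* Concretely, on a big-endian target with 8-byte stack slots: a scalar
   short is padded downward, so its two bytes sit at the highest-addressed
   end of the slot with the least significant byte at the highest address,
   whereas a two-byte structure is padded upward and sits at the
   lowest-addressed end with the padding after it.  Little-endian targets
   always pad upward.  */
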
4148 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4149
4150 It specifies the padding for the last element (which may also be the
4151 only element) of a block move between registers and memory. Viewing
4152 the block as it sits in memory, padding upward means that the last
4153 element is padded after its most significant byte, while with downward
4154 padding the last element is padded on its least significant byte
4155 side.
4156
4157 Small aggregates and small complex types are always padded
4158 upwards.
4159
4160 We don't need to worry about homogeneous floating-point or
4161 short-vector aggregates; their move is not affected by the
4162 padding direction determined here. Regardless of endianness,
4163 each element of such an aggregate is put in the least
4164 significant bits of a fp/simd register.
4165
4166 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4167 register has useful data, and return the opposite if the most
4168 significant byte does. */
4169
4170 bool
4171 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4172 bool first ATTRIBUTE_UNUSED)
4173 {
4174
4175 /* Small composite types are always padded upward. */
4176 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4177 {
4178 HOST_WIDE_INT size;
4179 if (type)
4180 size = int_size_in_bytes (type);
4181 else
4182 /* No frontends can create types with variable-sized modes, so we
4183 shouldn't be asked to pass or return them. */
4184 size = GET_MODE_SIZE (mode).to_constant ();
4185 if (size < 2 * UNITS_PER_WORD)
4186 return true;
4187 }
4188
4189 /* Otherwise, use the default padding. */
4190 return !BYTES_BIG_ENDIAN;
4191 }
4192
4193 static scalar_int_mode
4194 aarch64_libgcc_cmp_return_mode (void)
4195 {
4196 return SImode;
4197 }
4198
4199 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4200
4201 /* We use the 12-bit shifted immediate arithmetic instructions so values
4202 must be multiples of (1 << 12), i.e. 4096. */
4203 #define ARITH_FACTOR 4096
4204
4205 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4206 #error Cannot use simple address calculation for stack probing
4207 #endif
4208
4209 /* The pair of scratch registers used for stack probing. */
4210 #define PROBE_STACK_FIRST_REG R9_REGNUM
4211 #define PROBE_STACK_SECOND_REG R10_REGNUM
4212
4213 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4214 inclusive. These are offsets from the current stack pointer. */
4215
4216 static void
4217 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4218 {
4219 HOST_WIDE_INT size;
4220 if (!poly_size.is_constant (&size))
4221 {
4222 sorry ("stack probes for SVE frames");
4223 return;
4224 }
4225
4226 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4227
4228 /* See the same assertion on PROBE_INTERVAL above. */
4229 gcc_assert ((first % ARITH_FACTOR) == 0);
4230
4231 /* See if we have a constant small number of probes to generate. If so,
4232 that's the easy case. */
4233 if (size <= PROBE_INTERVAL)
4234 {
4235 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4236
4237 emit_set_insn (reg1,
4238 plus_constant (Pmode,
4239 stack_pointer_rtx, -(first + base)));
4240 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4241 }
4242
4243 /* The run-time loop is made up of 8 insns in the generic case while the
4244 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4245 else if (size <= 4 * PROBE_INTERVAL)
4246 {
4247 HOST_WIDE_INT i, rem;
4248
4249 emit_set_insn (reg1,
4250 plus_constant (Pmode,
4251 stack_pointer_rtx,
4252 -(first + PROBE_INTERVAL)));
4253 emit_stack_probe (reg1);
4254
4255 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4256 it exceeds SIZE. If only two probes are needed, this will not
4257 generate any code. Then probe at FIRST + SIZE. */
4258 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4259 {
4260 emit_set_insn (reg1,
4261 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4262 emit_stack_probe (reg1);
4263 }
4264
4265 rem = size - (i - PROBE_INTERVAL);
4266 if (rem > 256)
4267 {
4268 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4269
4270 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4271 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4272 }
4273 else
4274 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4275 }
4276
4277 /* Otherwise, do the same as above, but in a loop. Note that we must be
4278 extra careful with variables wrapping around because we might be at
4279 the very top (or the very bottom) of the address space and we have
4280 to be able to handle this case properly; in particular, we use an
4281 equality test for the loop condition. */
4282 else
4283 {
4284 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4285
4286 /* Step 1: round SIZE to the previous multiple of the interval. */
4287
4288 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4289
4290
4291 /* Step 2: compute initial and final value of the loop counter. */
4292
4293 /* TEST_ADDR = SP + FIRST. */
4294 emit_set_insn (reg1,
4295 plus_constant (Pmode, stack_pointer_rtx, -first));
4296
4297 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4298 HOST_WIDE_INT adjustment = - (first + rounded_size);
4299 if (! aarch64_uimm12_shift (adjustment))
4300 {
4301 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4302 true, Pmode);
4303 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4304 }
4305 else
4306 emit_set_insn (reg2,
4307 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4308
4309 /* Step 3: the loop
4310
4311 do
4312 {
4313 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4314 probe at TEST_ADDR
4315 }
4316 while (TEST_ADDR != LAST_ADDR)
4317
4318 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4319 until it is equal to ROUNDED_SIZE. */
4320
4321 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4322
4323
4324 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4325 that SIZE is equal to ROUNDED_SIZE. */
4326
4327 if (size != rounded_size)
4328 {
4329 HOST_WIDE_INT rem = size - rounded_size;
4330
4331 if (rem > 256)
4332 {
4333 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4334
4335 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4336 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4337 }
4338 else
4339 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4340 }
4341 }
4342
4343 /* Make sure nothing is scheduled before we are done. */
4344 emit_insn (gen_blockage ());
4345 }
4346
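/* For example, with PROBE_INTERVAL == 4096, FIRST == 0 and a constant
   SIZE of 12288, the middle branch above is taken and the emitted code is
   roughly (x9 being PROBE_STACK_FIRST_REG):

        sub     x9, sp, #4096
        str     xzr, [x9]
        sub     x9, x9, #4096
        str     xzr, [x9]
        sub     x9, x9, #4096
        str     xzr, [x9]

   i.e. one probe per page, at sp-4096, sp-8192 and sp-12288.  */
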
4347 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4348 absolute addresses. */
4349
4350 const char *
4351 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4352 {
4353 static int labelno = 0;
4354 char loop_lab[32];
4355 rtx xops[2];
4356
4357 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4358
4359 /* Loop. */
4360 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4361
4362 HOST_WIDE_INT stack_clash_probe_interval
4363 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4364
4365 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4366 xops[0] = reg1;
4367 HOST_WIDE_INT interval;
4368 if (flag_stack_clash_protection)
4369 interval = stack_clash_probe_interval;
4370 else
4371 interval = PROBE_INTERVAL;
4372
4373 gcc_assert (aarch64_uimm12_shift (interval));
4374 xops[1] = GEN_INT (interval);
4375
4376 output_asm_insn ("sub\t%0, %0, %1", xops);
4377
4378 /* If doing stack clash protection then we probe up by the ABI specified
4379 amount. We do this because we're dropping full pages at a time in the
4380 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4381 if (flag_stack_clash_protection)
4382 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4383 else
4384 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4385
4386 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4387 by this amount for each iteration. */
4388 output_asm_insn ("str\txzr, [%0, %1]", xops);
4389
4390 /* Test if TEST_ADDR == LAST_ADDR. */
4391 xops[1] = reg2;
4392 output_asm_insn ("cmp\t%0, %1", xops);
4393
4394 /* Branch. */
4395 fputs ("\tb.ne\t", asm_out_file);
4396 assemble_name_raw (asm_out_file, loop_lab);
4397 fputc ('\n', asm_out_file);
4398
4399 return "";
4400 }
4401
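/* With stack-clash protection disabled, the loop printed by this function
   therefore looks roughly like the following (assuming the x9/x10 scratch
   pair chosen by aarch64_emit_probe_stack_range):

.LPSRL0:
        sub     x9, x9, 4096
        str     xzr, [x9, 0]
        cmp     x9, x10
        b.ne    .LPSRL0

   With stack-clash protection enabled, the sub uses the configured guard
   size and the probe is written STACK_CLASH_CALLER_GUARD bytes above the
   new x9 rather than at offset 0.  */
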
4402 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4403 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4404 of GUARD_SIZE. When a probe is emitted it is done at most
4405 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4406 at most MIN_PROBE_THRESHOLD. By the end of this function
4407 BASE = BASE - ADJUSTMENT. */
4408
4409 const char *
4410 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4411 rtx min_probe_threshold, rtx guard_size)
4412 {
4413 /* This function is not allowed to use any instruction generation function
4414 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4415 so instead emit the code you want using output_asm_insn. */
4416 gcc_assert (flag_stack_clash_protection);
4417 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4418 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4419
4420 /* The minimum required allocation before the residual requires probing. */
4421 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4422
4423 /* Clamp the value down to the nearest value that can be used with a cmp. */
4424 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4425 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4426
4427 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4428 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4429
4430 static int labelno = 0;
4431 char loop_start_lab[32];
4432 char loop_end_lab[32];
4433 rtx xops[2];
4434
4435 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4436 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4437
4438 /* Emit loop start label. */
4439 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4440
4441 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4442 xops[0] = adjustment;
4443 xops[1] = probe_offset_value_rtx;
4444 output_asm_insn ("cmp\t%0, %1", xops);
4445
4446 /* Branch to end if not enough adjustment to probe. */
4447 fputs ("\tb.lt\t", asm_out_file);
4448 assemble_name_raw (asm_out_file, loop_end_lab);
4449 fputc ('\n', asm_out_file);
4450
4451 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4452 xops[0] = base;
4453 xops[1] = probe_offset_value_rtx;
4454 output_asm_insn ("sub\t%0, %0, %1", xops);
4455
4456 /* Probe at BASE. */
4457 xops[1] = const0_rtx;
4458 output_asm_insn ("str\txzr, [%0, %1]", xops);
4459
4460 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4461 xops[0] = adjustment;
4462 xops[1] = probe_offset_value_rtx;
4463 output_asm_insn ("sub\t%0, %0, %1", xops);
4464
4465 /* Branch to start if still more bytes to allocate. */
4466 fputs ("\tb\t", asm_out_file);
4467 assemble_name_raw (asm_out_file, loop_start_lab);
4468 fputc ('\n', asm_out_file);
4469
4470 /* Loop exit: the remaining adjustment needs no probe. */
4471 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4472
4473 /* BASE = BASE - ADJUSTMENT. */
4474 xops[0] = base;
4475 xops[1] = adjustment;
4476 output_asm_insn ("sub\t%0, %0, %1", xops);
4477 return "";
4478 }
4479
4480 /* Determine whether a frame chain needs to be generated. */
4481 static bool
4482 aarch64_needs_frame_chain (void)
4483 {
4484 /* Force a frame chain for EH returns so the return address is at FP+8. */
4485 if (frame_pointer_needed || crtl->calls_eh_return)
4486 return true;
4487
4488 /* A leaf function cannot have calls or write LR. */
4489 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4490
4491 /* Don't use a frame chain in leaf functions if leaf frame pointers
4492 are disabled. */
4493 if (flag_omit_leaf_frame_pointer && is_leaf)
4494 return false;
4495
4496 return aarch64_use_frame_pointer;
4497 }
4498
4499 /* Mark the registers that need to be saved by the callee and calculate
4500 the size of the callee-saved registers area and frame record (both FP
4501 and LR may be omitted). */
4502 static void
4503 aarch64_layout_frame (void)
4504 {
4505 HOST_WIDE_INT offset = 0;
4506 int regno, last_fp_reg = INVALID_REGNUM;
4507 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4508
4509 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4510
4511 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4512 the mid-end is doing. */
4513 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4514
4515 #define SLOT_NOT_REQUIRED (-2)
4516 #define SLOT_REQUIRED (-1)
4517
4518 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4519 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4520
4521 /* If this is a SIMD function that is not a leaf (i.e. it makes calls),
4522 we assume that at least one of those calls is to a non-simd function
4523 and thus we must save V8 to V23 in the prologue.
4524
4525 if (simd_function && !crtl->is_leaf)
4526 {
4527 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4528 if (FP_SIMD_SAVED_REGNUM_P (regno))
4529 df_set_regs_ever_live (regno, true);
4530 }
4531
4532 /* First mark all the registers that really need to be saved... */
4533 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4534 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4535
4536 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4537 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4538
4539 /* ... that includes the eh data registers (if needed)... */
4540 if (crtl->calls_eh_return)
4541 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4542 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4543 = SLOT_REQUIRED;
4544
4545 /* ... and any callee saved register that dataflow says is live. */
4546 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4547 if (df_regs_ever_live_p (regno)
4548 && (regno == R30_REGNUM
4549 || !call_used_regs[regno]))
4550 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4551
4552 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4553 if (df_regs_ever_live_p (regno)
4554 && (!call_used_regs[regno]
4555 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4556 {
4557 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4558 last_fp_reg = regno;
4559 }
4560
4561 if (cfun->machine->frame.emit_frame_chain)
4562 {
4563 /* FP and LR are placed in the linkage record. */
4564 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4565 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4566 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4567 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4568 offset = 2 * UNITS_PER_WORD;
4569 }
4570
4571 /* With stack-clash, LR must be saved in non-leaf functions. */
4572 gcc_assert (crtl->is_leaf
4573 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4574 != SLOT_NOT_REQUIRED));
4575
4576 /* Now assign stack slots for them. */
4577 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4578 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4579 {
4580 cfun->machine->frame.reg_offset[regno] = offset;
4581 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4582 cfun->machine->frame.wb_candidate1 = regno;
4583 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4584 cfun->machine->frame.wb_candidate2 = regno;
4585 offset += UNITS_PER_WORD;
4586 }
4587
4588 HOST_WIDE_INT max_int_offset = offset;
4589 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4590 bool has_align_gap = offset != max_int_offset;
4591
4592 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4593 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4594 {
4595 /* If there is an alignment gap between integer and fp callee-saves,
4596 allocate the last fp register to it if possible. */
4597 if (regno == last_fp_reg
4598 && has_align_gap
4599 && !simd_function
4600 && (offset & 8) == 0)
4601 {
4602 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4603 break;
4604 }
4605
4606 cfun->machine->frame.reg_offset[regno] = offset;
4607 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4608 cfun->machine->frame.wb_candidate1 = regno;
4609 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4610 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4611 cfun->machine->frame.wb_candidate2 = regno;
4612 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4613 }
4614
4615 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4616
4617 cfun->machine->frame.saved_regs_size = offset;
4618
4619 HOST_WIDE_INT varargs_and_saved_regs_size
4620 = offset + cfun->machine->frame.saved_varargs_size;
4621
4622 cfun->machine->frame.hard_fp_offset
4623 = aligned_upper_bound (varargs_and_saved_regs_size
4624 + get_frame_size (),
4625 STACK_BOUNDARY / BITS_PER_UNIT);
4626
4627 /* Both these values are already aligned. */
4628 gcc_assert (multiple_p (crtl->outgoing_args_size,
4629 STACK_BOUNDARY / BITS_PER_UNIT));
4630 cfun->machine->frame.frame_size
4631 = (cfun->machine->frame.hard_fp_offset
4632 + crtl->outgoing_args_size);
4633
4634 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4635
4636 cfun->machine->frame.initial_adjust = 0;
4637 cfun->machine->frame.final_adjust = 0;
4638 cfun->machine->frame.callee_adjust = 0;
4639 cfun->machine->frame.callee_offset = 0;
4640
4641 HOST_WIDE_INT max_push_offset = 0;
4642 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4643 max_push_offset = 512;
4644 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4645 max_push_offset = 256;
4646
4647 HOST_WIDE_INT const_size, const_fp_offset;
4648 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4649 && const_size < max_push_offset
4650 && known_eq (crtl->outgoing_args_size, 0))
4651 {
4652 /* Simple, small frame with no outgoing arguments:
4653 stp reg1, reg2, [sp, -frame_size]!
4654 stp reg3, reg4, [sp, 16] */
4655 cfun->machine->frame.callee_adjust = const_size;
4656 }
4657 else if (known_lt (crtl->outgoing_args_size
4658 + cfun->machine->frame.saved_regs_size, 512)
4659 && !(cfun->calls_alloca
4660 && known_lt (cfun->machine->frame.hard_fp_offset,
4661 max_push_offset)))
4662 {
4663 /* Frame with small outgoing arguments:
4664 sub sp, sp, frame_size
4665 stp reg1, reg2, [sp, outgoing_args_size]
4666 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4667 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4668 cfun->machine->frame.callee_offset
4669 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4670 }
4671 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4672 && const_fp_offset < max_push_offset)
4673 {
4674 /* Frame with large outgoing arguments but a small local area:
4675 stp reg1, reg2, [sp, -hard_fp_offset]!
4676 stp reg3, reg4, [sp, 16]
4677 sub sp, sp, outgoing_args_size */
4678 cfun->machine->frame.callee_adjust = const_fp_offset;
4679 cfun->machine->frame.final_adjust
4680 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4681 }
4682 else
4683 {
4684 /* Frame with large local area and outgoing arguments using frame pointer:
4685 sub sp, sp, hard_fp_offset
4686 stp x29, x30, [sp, 0]
4687 add x29, sp, 0
4688 stp reg3, reg4, [sp, 16]
4689 sub sp, sp, outgoing_args_size */
4690 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4691 cfun->machine->frame.final_adjust
4692 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4693 }
4694
4695 cfun->machine->frame.laid_out = true;
4696 }
4697
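/* A worked example of the selection above: a function that needs a frame
   chain, also saves x19 and x20, has 16 bytes of locals and no outgoing
   arguments ends up with reg_offsets x29:0, x30:8, x19:16, x20:24, so
   saved_regs_size == 32, hard_fp_offset == 48 and frame_size == 48.
   Since 48 < max_push_offset and there are no outgoing arguments, the
   first case applies and the whole allocation is folded into the
   write-back of the first store pair (callee_adjust == 48), matching the
   "stp reg1, reg2, [sp, -frame_size]!" pattern shown in its comment.  */
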
4698 /* Return true if the register REGNO is saved on entry to
4699 the current function. */
4700
4701 static bool
4702 aarch64_register_saved_on_entry (int regno)
4703 {
4704 return cfun->machine->frame.reg_offset[regno] >= 0;
4705 }
4706
4707 /* Return the next register, from REGNO up to LIMIT, that the callee
4708 needs to save. */
4709
4710 static unsigned
4711 aarch64_next_callee_save (unsigned regno, unsigned limit)
4712 {
4713 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4714 regno ++;
4715 return regno;
4716 }
4717
4718 /* Push the register number REGNO of mode MODE to the stack with write-back
4719 adjusting the stack by ADJUSTMENT. */
4720
4721 static void
4722 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4723 HOST_WIDE_INT adjustment)
4724 {
4725 rtx base_rtx = stack_pointer_rtx;
4726 rtx insn, reg, mem;
4727
4728 reg = gen_rtx_REG (mode, regno);
4729 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4730 plus_constant (Pmode, base_rtx, -adjustment));
4731 mem = gen_frame_mem (mode, mem);
4732
4733 insn = emit_move_insn (mem, reg);
4734 RTX_FRAME_RELATED_P (insn) = 1;
4735 }
4736
4737 /* Generate and return an instruction to store the pair of registers
4738 REG and REG2 of mode MODE to location BASE with write-back adjusting
4739 the stack location BASE by ADJUSTMENT. */
4740
4741 static rtx
4742 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4743 HOST_WIDE_INT adjustment)
4744 {
4745 switch (mode)
4746 {
4747 case E_DImode:
4748 return gen_storewb_pairdi_di (base, base, reg, reg2,
4749 GEN_INT (-adjustment),
4750 GEN_INT (UNITS_PER_WORD - adjustment));
4751 case E_DFmode:
4752 return gen_storewb_pairdf_di (base, base, reg, reg2,
4753 GEN_INT (-adjustment),
4754 GEN_INT (UNITS_PER_WORD - adjustment));
4755 case E_TFmode:
4756 return gen_storewb_pairtf_di (base, base, reg, reg2,
4757 GEN_INT (-adjustment),
4758 GEN_INT (UNITS_PER_VREG - adjustment));
4759 default:
4760 gcc_unreachable ();
4761 }
4762 }
4763
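/* For instance, called with MODE == DImode, BASE == the stack pointer,
   REG == x29, REG2 == x30 and ADJUSTMENT == 48, the returned insn
   corresponds to the pre-indexed store pair

        stp     x29, x30, [sp, -48]!

   which both saves the pair and performs the initial stack adjustment.  */
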
4764 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4765 stack pointer by ADJUSTMENT. */
4766
4767 static void
4768 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4769 {
4770 rtx_insn *insn;
4771 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4772
4773 if (regno2 == INVALID_REGNUM)
4774 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4775
4776 rtx reg1 = gen_rtx_REG (mode, regno1);
4777 rtx reg2 = gen_rtx_REG (mode, regno2);
4778
4779 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4780 reg2, adjustment));
4781 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4782 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4783 RTX_FRAME_RELATED_P (insn) = 1;
4784 }
4785
4786 /* Load the pair of registers REG and REG2 of mode MODE from stack location
4787 BASE, adjusting BASE by ADJUSTMENT afterwards. */
4788
4789 static rtx
4790 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4791 HOST_WIDE_INT adjustment)
4792 {
4793 switch (mode)
4794 {
4795 case E_DImode:
4796 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4797 GEN_INT (UNITS_PER_WORD));
4798 case E_DFmode:
4799 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4800 GEN_INT (UNITS_PER_WORD));
4801 case E_TFmode:
4802 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4803 GEN_INT (UNITS_PER_VREG));
4804 default:
4805 gcc_unreachable ();
4806 }
4807 }
4808
4809 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4810 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4811 into CFI_OPS. */
4812
4813 static void
4814 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4815 rtx *cfi_ops)
4816 {
4817 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4818 rtx reg1 = gen_rtx_REG (mode, regno1);
4819
4820 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4821
4822 if (regno2 == INVALID_REGNUM)
4823 {
4824 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4825 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4826 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4827 }
4828 else
4829 {
4830 rtx reg2 = gen_rtx_REG (mode, regno2);
4831 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4832 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4833 reg2, adjustment));
4834 }
4835 }
4836
4837 /* Generate and return a store pair instruction of mode MODE to store
4838 register REG1 to MEM1 and register REG2 to MEM2. */
4839
4840 static rtx
4841 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4842 rtx reg2)
4843 {
4844 switch (mode)
4845 {
4846 case E_DImode:
4847 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4848
4849 case E_DFmode:
4850 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4851
4852 case E_TFmode:
4853 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4854
4855 default:
4856 gcc_unreachable ();
4857 }
4858 }
4859
4860 /* Generate and return a load pair instruction of mode MODE to load register
4861 REG1 from MEM1 and register REG2 from MEM2. */
4862
4863 static rtx
4864 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4865 rtx mem2)
4866 {
4867 switch (mode)
4868 {
4869 case E_DImode:
4870 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4871
4872 case E_DFmode:
4873 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4874
4875 case E_TFmode:
4876 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4877
4878 default:
4879 gcc_unreachable ();
4880 }
4881 }
4882
4883 /* Return TRUE if return address signing should be enabled for the current
4884 function, otherwise return FALSE. */
4885
4886 bool
4887 aarch64_return_address_signing_enabled (void)
4888 {
4889 /* This function should only be called after the frame has been laid out. */
4890 gcc_assert (cfun->machine->frame.laid_out);
4891
4892 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4893 if its LR is pushed onto stack. */
4894 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4895 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4896 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4897 }
4898
4899 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4900 bool
4901 aarch64_bti_enabled (void)
4902 {
4903 return (aarch64_enable_bti == 1);
4904 }
4905
4906 /* Emit code to save the callee-saved registers from register number START
4907 to LIMIT to the stack at the location starting at offset START_OFFSET,
4908 skipping any write-back candidates if SKIP_WB is true. */
4909
4910 static void
4911 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4912 unsigned start, unsigned limit, bool skip_wb)
4913 {
4914 rtx_insn *insn;
4915 unsigned regno;
4916 unsigned regno2;
4917
4918 for (regno = aarch64_next_callee_save (start, limit);
4919 regno <= limit;
4920 regno = aarch64_next_callee_save (regno + 1, limit))
4921 {
4922 rtx reg, mem;
4923 poly_int64 offset;
4924 int offset_diff;
4925
4926 if (skip_wb
4927 && (regno == cfun->machine->frame.wb_candidate1
4928 || regno == cfun->machine->frame.wb_candidate2))
4929 continue;
4930
4931 if (cfun->machine->reg_is_wrapped_separately[regno])
4932 continue;
4933
4934 reg = gen_rtx_REG (mode, regno);
4935 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4936 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4937 offset));
4938
4939 regno2 = aarch64_next_callee_save (regno + 1, limit);
4940 offset_diff = cfun->machine->frame.reg_offset[regno2]
4941 - cfun->machine->frame.reg_offset[regno];
4942
4943 if (regno2 <= limit
4944 && !cfun->machine->reg_is_wrapped_separately[regno2]
4945 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4946 {
4947 rtx reg2 = gen_rtx_REG (mode, regno2);
4948 rtx mem2;
4949
4950 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4951 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4952 offset));
4953 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4954 reg2));
4955
4956 /* The first part of a frame-related parallel insn is
4957 always assumed to be relevant to the frame
4958 calculations; subsequent parts are only
4959 frame-related if explicitly marked. */
4960 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4961 regno = regno2;
4962 }
4963 else
4964 insn = emit_move_insn (mem, reg);
4965
4966 RTX_FRAME_RELATED_P (insn) = 1;
4967 }
4968 }
4969
4970 /* Emit code to restore the callee registers of mode MODE from register
4971 number START up to and including LIMIT. Restore from the stack offset
4972 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4973 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4974
4975 static void
4976 aarch64_restore_callee_saves (machine_mode mode,
4977 poly_int64 start_offset, unsigned start,
4978 unsigned limit, bool skip_wb, rtx *cfi_ops)
4979 {
4980 rtx base_rtx = stack_pointer_rtx;
4981 unsigned regno;
4982 unsigned regno2;
4983 poly_int64 offset;
4984
4985 for (regno = aarch64_next_callee_save (start, limit);
4986 regno <= limit;
4987 regno = aarch64_next_callee_save (regno + 1, limit))
4988 {
4989 if (cfun->machine->reg_is_wrapped_separately[regno])
4990 continue;
4991
4992 rtx reg, mem;
4993 int offset_diff;
4994
4995 if (skip_wb
4996 && (regno == cfun->machine->frame.wb_candidate1
4997 || regno == cfun->machine->frame.wb_candidate2))
4998 continue;
4999
5000 reg = gen_rtx_REG (mode, regno);
5001 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5002 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5003
5004 regno2 = aarch64_next_callee_save (regno + 1, limit);
5005 offset_diff = cfun->machine->frame.reg_offset[regno2]
5006 - cfun->machine->frame.reg_offset[regno];
5007
5008 if (regno2 <= limit
5009 && !cfun->machine->reg_is_wrapped_separately[regno2]
5010 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5011 {
5012 rtx reg2 = gen_rtx_REG (mode, regno2);
5013 rtx mem2;
5014
5015 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5016 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5017 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5018
5019 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5020 regno = regno2;
5021 }
5022 else
5023 emit_move_insn (reg, mem);
5024 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5025 }
5026 }
5027
5028 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5029 of MODE. */
5030
5031 static inline bool
5032 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5033 {
5034 HOST_WIDE_INT multiple;
5035 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5036 && IN_RANGE (multiple, -8, 7));
5037 }
5038
5039 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5040 of MODE. */
5041
5042 static inline bool
5043 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5044 {
5045 HOST_WIDE_INT multiple;
5046 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5047 && IN_RANGE (multiple, 0, 63));
5048 }
5049
5050 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5051 of MODE. */
5052
5053 bool
5054 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5055 {
5056 HOST_WIDE_INT multiple;
5057 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5058 && IN_RANGE (multiple, -64, 63));
5059 }
5060
5061 /* Return true if OFFSET is a signed 9-bit value. */
5062
5063 bool
5064 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5065 poly_int64 offset)
5066 {
5067 HOST_WIDE_INT const_offset;
5068 return (offset.is_constant (&const_offset)
5069 && IN_RANGE (const_offset, -256, 255));
5070 }
5071
5072 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5073 of MODE. */
5074
5075 static inline bool
5076 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5077 {
5078 HOST_WIDE_INT multiple;
5079 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5080 && IN_RANGE (multiple, -256, 255));
5081 }
5082
5083 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5084 of MODE. */
5085
5086 static inline bool
5087 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5088 {
5089 HOST_WIDE_INT multiple;
5090 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5091 && IN_RANGE (multiple, 0, 4095));
5092 }
5093
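/* For example, with MODE == DImode (8-byte units) the predicates above
   cover the following byte-offset ranges, which broadly correspond to the
   AArch64 addressing-mode limits they are used to validate:

     offset_4bit_signed_scaled_p:            -64 .. 56, multiples of 8
     offset_6bit_unsigned_scaled_p:            0 .. 504, multiples of 8
     aarch64_offset_7bit_signed_scaled_p:   -512 .. 504, multiples of 8
                                            (LDP/STP immediates)
     aarch64_offset_9bit_signed_unscaled_p: -256 .. 255, any byte offset
     offset_9bit_signed_scaled_p:          -2048 .. 2040, multiples of 8
     offset_12bit_unsigned_scaled_p:           0 .. 32760, multiples of 8
                                            (LDR/STR unsigned offset)  */
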
5094 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5095
5096 static sbitmap
5097 aarch64_get_separate_components (void)
5098 {
5099 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5100 bitmap_clear (components);
5101
5102 /* The registers we need saved to the frame. */
5103 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5104 if (aarch64_register_saved_on_entry (regno))
5105 {
5106 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5107 if (!frame_pointer_needed)
5108 offset += cfun->machine->frame.frame_size
5109 - cfun->machine->frame.hard_fp_offset;
5110 /* Check that we can access the stack slot of the register with one
5111 direct load with no adjustments needed. */
5112 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5113 bitmap_set_bit (components, regno);
5114 }
5115
5116 /* Don't mess with the hard frame pointer. */
5117 if (frame_pointer_needed)
5118 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5119
5120 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5121 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5122 /* If registers have been chosen to be stored/restored with
5123 writeback don't interfere with them to avoid having to output explicit
5124 stack adjustment instructions. */
5125 if (reg2 != INVALID_REGNUM)
5126 bitmap_clear_bit (components, reg2);
5127 if (reg1 != INVALID_REGNUM)
5128 bitmap_clear_bit (components, reg1);
5129
5130 bitmap_clear_bit (components, LR_REGNUM);
5131 bitmap_clear_bit (components, SP_REGNUM);
5132
5133 return components;
5134 }
5135
5136 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5137
5138 static sbitmap
5139 aarch64_components_for_bb (basic_block bb)
5140 {
5141 bitmap in = DF_LIVE_IN (bb);
5142 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5143 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5144 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5145
5146 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5147 bitmap_clear (components);
5148
5149 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5150 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5151 if ((!call_used_regs[regno]
5152 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5153 && (bitmap_bit_p (in, regno)
5154 || bitmap_bit_p (gen, regno)
5155 || bitmap_bit_p (kill, regno)))
5156 {
5157 unsigned regno2, offset, offset2;
5158 bitmap_set_bit (components, regno);
5159
5160 /* If there is a callee-save at an adjacent offset, add it as well
5161 to increase the use of LDP/STP. */
5162 offset = cfun->machine->frame.reg_offset[regno];
5163 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5164
5165 if (regno2 <= LAST_SAVED_REGNUM)
5166 {
5167 offset2 = cfun->machine->frame.reg_offset[regno2];
5168 if ((offset & ~8) == (offset2 & ~8))
5169 bitmap_set_bit (components, regno2);
5170 }
5171 }
5172
5173 return components;
5174 }
5175
5176 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5177 Nothing to do for aarch64. */
5178
5179 static void
5180 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5181 {
5182 }
5183
5184 /* Return the next set bit in BMP from START onwards. Return the total number
5185 of bits in BMP if no set bit is found at or after START. */
5186
5187 static unsigned int
5188 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5189 {
5190 unsigned int nbits = SBITMAP_SIZE (bmp);
5191 if (start == nbits)
5192 return start;
5193
5194 gcc_assert (start < nbits);
5195 for (unsigned int i = start; i < nbits; i++)
5196 if (bitmap_bit_p (bmp, i))
5197 return i;
5198
5199 return nbits;
5200 }
5201
5202 /* Do the work for aarch64_emit_prologue_components and
5203 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5204 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5205 for these components or the epilogue sequence. That is, it determines
5206 whether we should emit stores or loads and what kind of CFA notes to attach
5207 to the insns. Otherwise the logic for the two sequences is very
5208 similar. */
5209
5210 static void
5211 aarch64_process_components (sbitmap components, bool prologue_p)
5212 {
5213 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5214 ? HARD_FRAME_POINTER_REGNUM
5215 : STACK_POINTER_REGNUM);
5216
5217 unsigned last_regno = SBITMAP_SIZE (components);
5218 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5219 rtx_insn *insn = NULL;
5220
5221 while (regno != last_regno)
5222 {
5223 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5224 so DFmode for the vector registers is enough. For simd functions
5225 we want to save the low 128 bits. */
5226 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5227
5228 rtx reg = gen_rtx_REG (mode, regno);
5229 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5230 if (!frame_pointer_needed)
5231 offset += cfun->machine->frame.frame_size
5232 - cfun->machine->frame.hard_fp_offset;
5233 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5234 rtx mem = gen_frame_mem (mode, addr);
5235
5236 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5237 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5238 /* No more registers to handle after REGNO.
5239 Emit a single save/restore and exit. */
5240 if (regno2 == last_regno)
5241 {
5242 insn = emit_insn (set);
5243 RTX_FRAME_RELATED_P (insn) = 1;
5244 if (prologue_p)
5245 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5246 else
5247 add_reg_note (insn, REG_CFA_RESTORE, reg);
5248 break;
5249 }
5250
5251 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5252 /* The next register is not of the same class or its offset is not
5253 mergeable with the current one into a pair. */
5254 if (!satisfies_constraint_Ump (mem)
5255 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5256 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5257 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5258 GET_MODE_SIZE (mode)))
5259 {
5260 insn = emit_insn (set);
5261 RTX_FRAME_RELATED_P (insn) = 1;
5262 if (prologue_p)
5263 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5264 else
5265 add_reg_note (insn, REG_CFA_RESTORE, reg);
5266
5267 regno = regno2;
5268 continue;
5269 }
5270
5271 /* REGNO2 can be saved/restored in a pair with REGNO. */
5272 rtx reg2 = gen_rtx_REG (mode, regno2);
5273 if (!frame_pointer_needed)
5274 offset2 += cfun->machine->frame.frame_size
5275 - cfun->machine->frame.hard_fp_offset;
5276 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5277 rtx mem2 = gen_frame_mem (mode, addr2);
5278 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5279 : gen_rtx_SET (reg2, mem2);
5280
5281 if (prologue_p)
5282 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5283 else
5284 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5285
5286 RTX_FRAME_RELATED_P (insn) = 1;
5287 if (prologue_p)
5288 {
5289 add_reg_note (insn, REG_CFA_OFFSET, set);
5290 add_reg_note (insn, REG_CFA_OFFSET, set2);
5291 }
5292 else
5293 {
5294 add_reg_note (insn, REG_CFA_RESTORE, reg);
5295 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5296 }
5297
5298 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5299 }
5300 }
5301
5302 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5303
5304 static void
5305 aarch64_emit_prologue_components (sbitmap components)
5306 {
5307 aarch64_process_components (components, true);
5308 }
5309
5310 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5311
5312 static void
5313 aarch64_emit_epilogue_components (sbitmap components)
5314 {
5315 aarch64_process_components (components, false);
5316 }
5317
5318 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5319
5320 static void
5321 aarch64_set_handled_components (sbitmap components)
5322 {
5323 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5324 if (bitmap_bit_p (components, regno))
5325 cfun->machine->reg_is_wrapped_separately[regno] = true;
5326 }
5327
5328 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5329 determine the probe offset for alloca. */
5330
5331 static HOST_WIDE_INT
5332 aarch64_stack_clash_protection_alloca_probe_range (void)
5333 {
5334 return STACK_CLASH_CALLER_GUARD;
5335 }
5336
5337
5338 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5339 registers. If POLY_SIZE is not large enough to require a probe this function
5340 will only adjust the stack. When allocating the stack space
5341 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5342 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5343 arguments. If we are, then we ensure that any allocation larger than the
5344 ABI-defined buffer needs a probe, so that the invariant of having a 1KB
5345 buffer is maintained.
5346
5347 We emit barriers after each stack adjustment to prevent optimizations from
5348 breaking the invariant that we never drop the stack more than a page. This
5349 invariant is needed to make it easier to correctly handle asynchronous
5350 events, e.g. if we were to drop the stack by more than a page and then
5351 issue several probes to catch up, a signal taken somewhere in between
5352 would leave the handler not knowing the state of the stack and unable to
5353 make any assumption about which pages have been probed. */
5354
5355 static void
5356 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5357 poly_int64 poly_size,
5358 bool frame_related_p,
5359 bool final_adjustment_p)
5360 {
5361 HOST_WIDE_INT guard_size
5362 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5363 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5364 /* When doing the final adjustment for the outgoing argument size we can't
5365 assume that LR was saved at position 0. So subtract its offset from the
5366 ABI safe buffer so that we don't accidentally allow an adjustment that
5367 would result in an allocation larger than the ABI buffer without
5368 probing. */
5369 HOST_WIDE_INT min_probe_threshold
5370 = final_adjustment_p
5371 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5372 : guard_size - guard_used_by_caller;
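  /* For example, with the default 64kB guard and the 1kB caller-reserved
     buffer described in the frame-layout comment further down, the non-final
     case gives a threshold of 64kB - 1kB == 63kB, matching the "up to 63KB
     without probing" figure quoted there.  */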
5373
5374 poly_int64 frame_size = cfun->machine->frame.frame_size;
5375
5376 /* We should always have a positive probe threshold. */
5377 gcc_assert (min_probe_threshold > 0);
5378
5379 if (flag_stack_clash_protection && !final_adjustment_p)
5380 {
5381 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5382 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5383
5384 if (known_eq (frame_size, 0))
5385 {
5386 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5387 }
5388 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5389 && known_lt (final_adjust, guard_used_by_caller))
5390 {
5391 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5392 }
5393 }
5394
5395 /* If SIZE is not large enough to require probing, just adjust the stack and
5396 exit. */
5397 if (known_lt (poly_size, min_probe_threshold)
5398 || !flag_stack_clash_protection)
5399 {
5400 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5401 return;
5402 }
5403
5404 HOST_WIDE_INT size;
5405 /* Handle the SVE non-constant case first. */
5406 if (!poly_size.is_constant (&size))
5407 {
5408 if (dump_file)
5409 {
5410 fprintf (dump_file, "Stack clash SVE prologue: ");
5411 print_dec (poly_size, dump_file);
5412 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5413 }
5414
5415 /* First calculate the number of bytes we're actually spilling. */
5416 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5417 poly_size, temp1, temp2, false, true);
5418
5419 rtx_insn *insn = get_last_insn ();
5420
5421 if (frame_related_p)
5422 {
5423 /* This is done to provide unwinding information for the stack
5424 adjustments we're about to do; however, to prevent the optimizers
5425 from removing the R11 move and leaving the CFA note (which would be
5426 very wrong) we tie the old and new stack pointer together.
5427 The tie will expand to nothing but the optimizers will not touch
5428 the instruction. */
5429 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5430 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5431 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5432
5433 /* We want the CFA independent of the stack pointer for the
5434 duration of the loop. */
5435 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5436 RTX_FRAME_RELATED_P (insn) = 1;
5437 }
5438
5439 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5440 rtx guard_const = gen_int_mode (guard_size, Pmode);
5441
5442 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5443 stack_pointer_rtx, temp1,
5444 probe_const, guard_const));
5445
5446 /* Now reset the CFA register if needed. */
5447 if (frame_related_p)
5448 {
5449 add_reg_note (insn, REG_CFA_DEF_CFA,
5450 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5451 gen_int_mode (poly_size, Pmode)));
5452 RTX_FRAME_RELATED_P (insn) = 1;
5453 }
5454
5455 return;
5456 }
5457
5458 if (dump_file)
5459 fprintf (dump_file,
5460 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5461 " bytes, probing will be required.\n", size);
5462
5463 /* Round size down to a multiple of guard_size, and calculate the
5464 residual as the difference between the original size and the rounded
5465 size. */
5466 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5467 HOST_WIDE_INT residual = size - rounded_size;
5468
5469 /* We can handle a small number of allocations/probes inline. Otherwise
5470 punt to a loop. */
5471 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5472 {
5473 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5474 {
5475 aarch64_sub_sp (NULL, temp2, guard_size, true);
5476 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5477 guard_used_by_caller));
5478 emit_insn (gen_blockage ());
5479 }
5480 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5481 }
5482 else
5483 {
5484 /* Compute the ending address. */
5485 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5486 temp1, NULL, false, true);
5487 rtx_insn *insn = get_last_insn ();
5488
5489 /* For the initial allocation, we don't have a frame pointer
5490 set up, so we always need CFI notes. If we're doing the
5491 final allocation, then we may have a frame pointer, in which
5492 case it is the CFA, otherwise we need CFI notes.
5493
5494 We can determine which allocation we are doing by looking at
5495 the value of FRAME_RELATED_P since the final allocations are not
5496 frame related. */
5497 if (frame_related_p)
5498 {
5499 /* We want the CFA independent of the stack pointer for the
5500 duration of the loop. */
5501 add_reg_note (insn, REG_CFA_DEF_CFA,
5502 plus_constant (Pmode, temp1, rounded_size));
5503 RTX_FRAME_RELATED_P (insn) = 1;
5504 }
5505
5506 /* This allocates and probes the stack. Note that this re-uses some of
5507 the existing Ada stack protection code. However, we are guaranteed not
5508 to enter the non-loop or residual branches of that code.
5509
5510 The non-loop part won't be entered because if our allocation amount
5511 doesn't require a loop, the case above would handle it.
5512
5513 The residual amount won't be entered because TEMP1 is a multiple of
5514 the allocation size. The residual will always be 0. As such, the only
5515 part we are actually using from that code is the loop setup. The
5516 actual probing is done in aarch64_output_probe_stack_range. */
5517 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5518 stack_pointer_rtx, temp1));
5519
5520 /* Now reset the CFA register if needed. */
5521 if (frame_related_p)
5522 {
5523 add_reg_note (insn, REG_CFA_DEF_CFA,
5524 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5525 RTX_FRAME_RELATED_P (insn) = 1;
5526 }
5527
5528 emit_insn (gen_blockage ());
5529 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5530 }
5531
5532 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5533 be probed. This maintains the requirement that each page is probed at
5534 least once. For initial probing we probe only if the allocation is
5535 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5536 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5537 GUARD_SIZE. This guarantees that for any allocation large enough to
5538 trigger a probe here we emit at least one, and that for any allocation
5539 too small for this code to emit anything, the page will already have
5540 been probed by the save of FP/LR, either by this function or by one of
5541 its callees. If we don't have any callees then we won't have more stack
5542 adjustments and so are still safe. */
5543 if (residual)
5544 {
5545 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5546 /* If we're doing final adjustments, and we've done any full page
5547 allocations then any residual needs to be probed. */
5548 if (final_adjustment_p && rounded_size != 0)
5549 min_probe_threshold = 0;
5550 /* If doing a small final adjustment, we always probe at offset 0.
5551 This is done to avoid issues when LR is not at position 0 or when
5552 the final adjustment is smaller than the probing offset. */
5553 else if (final_adjustment_p && rounded_size == 0)
5554 residual_probe_offset = 0;
5555
5556 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5557 if (residual >= min_probe_threshold)
5558 {
5559 if (dump_file)
5560 fprintf (dump_file,
5561 "Stack clash AArch64 prologue residuals: "
5562 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5563 "\n", residual);
5564
5565 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5566 residual_probe_offset));
5567 emit_insn (gen_blockage ());
5568 }
5569 }
5570 }
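/* A rough sketch of what the inline unrolled case above might generate,
   assuming the default 64kB guard and 1kB caller buffer, for a constant
   allocation of 2 * 64kB + 512 bytes (illustrative only, not captured
   compiler output):

     sub   sp, sp, #65536
     str   xzr, [sp, 1024]
     sub   sp, sp, #65536
     str   xzr, [sp, 1024]
     sub   sp, sp, #512

   with the final 512-byte residual left unprobed because it is below
   MIN_PROBE_THRESHOLD.  */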
5571
5572 /* Return 1 if the register is used by the epilogue. We need to say the
5573 return register is used, but only after epilogue generation is complete.
5574 Note that in the case of sibcalls, the values "used by the epilogue" are
5575 considered live at the start of the called function.
5576
5577 For SIMD functions we need to return 1 for FP registers that are saved and
5578 restored by a function but are not zero in call_used_regs. If we do not do
5579 this, optimizations may remove the restore of the register. */
5580
5581 int
5582 aarch64_epilogue_uses (int regno)
5583 {
5584 if (epilogue_completed)
5585 {
5586 if (regno == LR_REGNUM)
5587 return 1;
5588 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5589 return 1;
5590 }
5591 return 0;
5592 }
5593
5594 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5595 is saved at BASE + OFFSET. */
5596
5597 static void
5598 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5599 rtx base, poly_int64 offset)
5600 {
5601 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5602 add_reg_note (insn, REG_CFA_EXPRESSION,
5603 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5604 }
5605
5606 /* AArch64 stack frames generated by this compiler look like:
5607
5608 +-------------------------------+
5609 | |
5610 | incoming stack arguments |
5611 | |
5612 +-------------------------------+
5613 | | <-- incoming stack pointer (aligned)
5614 | callee-allocated save area |
5615 | for register varargs |
5616 | |
5617 +-------------------------------+
5618 | local variables | <-- frame_pointer_rtx
5619 | |
5620 +-------------------------------+
5621 | padding | \
5622 +-------------------------------+ |
5623 | callee-saved registers | | frame.saved_regs_size
5624 +-------------------------------+ |
5625 | LR' | |
5626 +-------------------------------+ |
5627 | FP' | / <- hard_frame_pointer_rtx (aligned)
5628 +-------------------------------+
5629 | dynamic allocation |
5630 +-------------------------------+
5631 | padding |
5632 +-------------------------------+
5633 | outgoing stack arguments | <-- arg_pointer
5634 | |
5635 +-------------------------------+
5636 | | <-- stack_pointer_rtx (aligned)
5637
5638 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5639 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5640 unchanged.
5641
5642 By default for stack-clash we assume the guard is at least 64KB, but this
5643 value is configurable to either 4KB or 64KB. We also force the guard size to
5644 be the same as the probing interval and both values are kept in sync.
5645
5646 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5647 on the guard size) of stack space without probing.
5648
5649 When probing is needed, we emit a probe at the start of the prologue
5650 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5651
5652 We have to track how much space has been allocated; the only stores
5653 to the stack that we track as implicit probes are the FP/LR stores.
5654
5655 For outgoing arguments we probe if the size is larger than 1KB, such that
5656 the ABI specified buffer is maintained for the next callee.
5657
5658 The following registers are reserved during frame layout and should not be
5659 used for any other purpose:
5660
5661 - r11: Used by stack clash protection when SVE is enabled.
5662 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5663 - r14 and r15: Used for speculation tracking.
5664 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5665 - r30(LR), r29(FP): Used by standard frame layout.
5666
5667 These registers must be avoided in frame layout related code unless the
5668 explicit intention is to interact with one of the features listed above. */
5669
5670 /* Generate the prologue instructions for entry into a function.
5671 Establish the stack frame by decreasing the stack pointer with a
5672 properly calculated size and, if necessary, create a frame record
5673 filled with the values of LR and previous frame pointer. The
5674 current FP is also set up if it is in use. */
5675
5676 void
5677 aarch64_expand_prologue (void)
5678 {
5679 poly_int64 frame_size = cfun->machine->frame.frame_size;
5680 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5681 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5682 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5683 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5684 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5685 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5686 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5687 rtx_insn *insn;
5688
5689 /* Sign return address for functions. */
5690 if (aarch64_return_address_signing_enabled ())
5691 {
5692 switch (aarch64_ra_sign_key)
5693 {
5694 case AARCH64_KEY_A:
5695 insn = emit_insn (gen_paciasp ());
5696 break;
5697 case AARCH64_KEY_B:
5698 insn = emit_insn (gen_pacibsp ());
5699 break;
5700 default:
5701 gcc_unreachable ();
5702 }
5703 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5704 RTX_FRAME_RELATED_P (insn) = 1;
5705 }
5706
5707 if (flag_stack_usage_info)
5708 current_function_static_stack_size = constant_lower_bound (frame_size);
5709
5710 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5711 {
5712 if (crtl->is_leaf && !cfun->calls_alloca)
5713 {
5714 if (maybe_gt (frame_size, PROBE_INTERVAL)
5715 && maybe_gt (frame_size, get_stack_check_protect ()))
5716 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5717 (frame_size
5718 - get_stack_check_protect ()));
5719 }
5720 else if (maybe_gt (frame_size, 0))
5721 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5722 }
5723
5724 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5725 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5726
5727 /* In theory we should never have both an initial adjustment
5728 and a callee save adjustment. Verify that is the case since the
5729 code below does not handle it for -fstack-clash-protection. */
5730 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5731
5732 /* Will only probe if the initial adjustment is larger than the guard
5733 less the amount of the guard reserved for use by the caller's
5734 outgoing args. */
5735 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5736 true, false);
5737
5738 if (callee_adjust != 0)
5739 aarch64_push_regs (reg1, reg2, callee_adjust);
5740
5741 if (emit_frame_chain)
5742 {
5743 poly_int64 reg_offset = callee_adjust;
5744 if (callee_adjust == 0)
5745 {
5746 reg1 = R29_REGNUM;
5747 reg2 = R30_REGNUM;
5748 reg_offset = callee_offset;
5749 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5750 }
5751 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5752 stack_pointer_rtx, callee_offset,
5753 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5754 if (frame_pointer_needed && !frame_size.is_constant ())
5755 {
5756 /* Variable-sized frames need to describe the save slot
5757 address using DW_CFA_expression rather than DW_CFA_offset.
5758 This means that, without taking further action, the
5759 locations of the registers that we've already saved would
5760 remain based on the stack pointer even after we redefine
5761 the CFA based on the frame pointer. We therefore need new
5762 DW_CFA_expressions to re-express the save slots with addresses
5763 based on the frame pointer. */
5764 rtx_insn *insn = get_last_insn ();
5765 gcc_assert (RTX_FRAME_RELATED_P (insn));
5766
5767 /* Add an explicit CFA definition if this was previously
5768 implicit. */
5769 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5770 {
5771 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5772 callee_offset);
5773 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5774 gen_rtx_SET (hard_frame_pointer_rtx, src));
5775 }
5776
5777 /* Change the save slot expressions for the registers that
5778 we've already saved. */
5779 reg_offset -= callee_offset;
5780 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5781 reg_offset + UNITS_PER_WORD);
5782 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5783 reg_offset);
5784 }
5785 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5786 }
5787
5788 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5789 callee_adjust != 0 || emit_frame_chain);
5790 if (aarch64_simd_decl_p (cfun->decl))
5791 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5792 callee_adjust != 0 || emit_frame_chain);
5793 else
5794 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5795 callee_adjust != 0 || emit_frame_chain);
5796
5797 /* We may need to probe the final adjustment if it is larger than the guard
5798 that is assumed by the callee. */
5799 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5800 !frame_pointer_needed, true);
5801 }
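/* A hedged sketch of what the code above typically produces for a small
   constant-sized frame that needs a frame chain and one extra callee-save,
   with no stack-clash probing required:

     stp   x29, x30, [sp, -32]!   callee_adjust push of FP/LR
     mov   x29, sp                establish the frame chain
     str   x19, [sp, 16]          remaining callee-saves
     sub   sp, sp, #16            final_adjust for outgoing arguments  */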
5802
5803 /* Return TRUE if we can use a simple_return insn.
5804
5805 This function checks whether the callee-saved stack is empty, which
5806 means no restore actions are needed. The pro_and_epilogue pass will use
5807 this to check whether the shrink-wrapping optimization is feasible. */
5808
5809 bool
5810 aarch64_use_return_insn_p (void)
5811 {
5812 if (!reload_completed)
5813 return false;
5814
5815 if (crtl->profile)
5816 return false;
5817
5818 return known_eq (cfun->machine->frame.frame_size, 0);
5819 }
5820
5821 /* Return false for non-leaf SIMD functions in order to avoid
5822 shrink-wrapping them. Doing this will lose the necessary
5823 save/restore of FP registers. */
5824
5825 bool
5826 aarch64_use_simple_return_insn_p (void)
5827 {
5828 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5829 return false;
5830
5831 return true;
5832 }
5833
5834 /* Generate the epilogue instructions for returning from a function.
5835 This is almost exactly the reverse of the prolog sequence, except
5836 that we need to insert barriers to avoid scheduling loads that read
5837 from a deallocated stack, and we optimize the unwind records by
5838 emitting them all together if possible. */
5839 void
5840 aarch64_expand_epilogue (bool for_sibcall)
5841 {
5842 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5843 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5844 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5845 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5846 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5847 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5848 rtx cfi_ops = NULL;
5849 rtx_insn *insn;
5850 /* A stack clash protection prologue may not have left EP0_REGNUM or
5851 EP1_REGNUM in a usable state. The same is true for allocations
5852 with an SVE component, since we then need both temporary registers
5853 for each allocation. For stack clash we are in a usable state if
5854 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5855 HOST_WIDE_INT guard_size
5856 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5857 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5858
5859 /* We can re-use the registers when the allocation amount is smaller than
5860 guard_size - guard_used_by_caller because we won't be doing any probes
5861 then. In such situations the register should remain live with the correct
5862 value. */
5863 bool can_inherit_p = (initial_adjust.is_constant ()
5864 && final_adjust.is_constant ())
5865 && (!flag_stack_clash_protection
5866 || known_lt (initial_adjust,
5867 guard_size - guard_used_by_caller));
5868
5869 /* We need a memory barrier to prevent reads from the deallocated stack. */
5870 bool need_barrier_p
5871 = maybe_ne (get_frame_size ()
5872 + cfun->machine->frame.saved_varargs_size, 0);
5873
5874 /* Emit a barrier to prevent loads from a deallocated stack. */
5875 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5876 || cfun->calls_alloca
5877 || crtl->calls_eh_return)
5878 {
5879 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5880 need_barrier_p = false;
5881 }
5882
5883 /* Restore the stack pointer from the frame pointer if it may not
5884 be the same as the stack pointer. */
5885 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5886 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5887 if (frame_pointer_needed
5888 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5889 /* If writeback is used when restoring callee-saves, the CFA
5890 is restored on the instruction doing the writeback. */
5891 aarch64_add_offset (Pmode, stack_pointer_rtx,
5892 hard_frame_pointer_rtx, -callee_offset,
5893 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5894 else
5895 /* The case where we need to re-use the register here is very rare, so
5896 avoid the complicated condition and just always emit a move if the
5897 immediate doesn't fit. */
5898 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5899
5900 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5901 callee_adjust != 0, &cfi_ops);
5902 if (aarch64_simd_decl_p (cfun->decl))
5903 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5904 callee_adjust != 0, &cfi_ops);
5905 else
5906 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5907 callee_adjust != 0, &cfi_ops);
5908
5909 if (need_barrier_p)
5910 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5911
5912 if (callee_adjust != 0)
5913 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5914
5915 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5916 {
5917 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5918 insn = get_last_insn ();
5919 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5920 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5921 RTX_FRAME_RELATED_P (insn) = 1;
5922 cfi_ops = NULL;
5923 }
5924
5925 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5926 restrict the emit_move optimization to leaf functions. */
5927 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5928 (!can_inherit_p || !crtl->is_leaf
5929 || df_regs_ever_live_p (EP0_REGNUM)));
5930
5931 if (cfi_ops)
5932 {
5933 /* Emit delayed restores and reset the CFA to be SP. */
5934 insn = get_last_insn ();
5935 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5936 REG_NOTES (insn) = cfi_ops;
5937 RTX_FRAME_RELATED_P (insn) = 1;
5938 }
5939
5940 /* We prefer to emit the combined return/authenticate instruction RETAA,
5941 however there are three cases in which we must instead emit an explicit
5942 authentication instruction.
5943
5944 1) Sibcalls don't return in a normal way, so if we're about to call one
5945 we must authenticate.
5946
5947 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5948 generating code for !TARGET_ARMV8_3 we can't use it and must
5949 explicitly authenticate.
5950
5951 3) On an eh_return path we make extra stack adjustments to update the
5952 canonical frame address to be the exception handler's CFA. We want
5953 to authenticate using the CFA of the function which calls eh_return.
5954 */
5955 if (aarch64_return_address_signing_enabled ()
5956 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5957 {
5958 switch (aarch64_ra_sign_key)
5959 {
5960 case AARCH64_KEY_A:
5961 insn = emit_insn (gen_autiasp ());
5962 break;
5963 case AARCH64_KEY_B:
5964 insn = emit_insn (gen_autibsp ());
5965 break;
5966 default:
5967 gcc_unreachable ();
5968 }
5969 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5970 RTX_FRAME_RELATED_P (insn) = 1;
5971 }
5972
5973 /* Stack adjustment for exception handler. */
5974 if (crtl->calls_eh_return && !for_sibcall)
5975 {
5976 /* We need to unwind the stack by the offset computed by
5977 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5978 to be SP; letting the CFA move during this adjustment
5979 is just as correct as retaining the CFA from the body
5980 of the function. Therefore, do nothing special. */
5981 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5982 }
5983
5984 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5985 if (!for_sibcall)
5986 emit_jump_insn (ret_rtx);
5987 }
5988
5989 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5990 normally or return to a previous frame after unwinding.
5991
5992 An EH return uses a single shared return sequence. The epilogue is
5993 exactly like a normal epilogue except that it has an extra input
5994 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5995 that must be applied after the frame has been destroyed. An extra label
5996 is inserted before the epilogue which initializes this register to zero,
5997 and this is the entry point for a normal return.
5998
5999 An actual EH return updates the return address, initializes the stack
6000 adjustment and jumps directly into the epilogue (bypassing the zeroing
6001 of the adjustment). Since the return address is typically saved on the
6002 stack when a function makes a call, the saved LR must be updated outside
6003 the epilogue.
6004
6005 This poses problems as the store is generated well before the epilogue,
6006 so the offset of LR is not known yet. Also optimizations will remove the
6007 store as it appears dead, even after the epilogue is generated (as the
6008 base or offset for loading LR is different in many cases).
6009
6010 To avoid these problems this implementation forces the frame pointer
6011 in eh_return functions so that the location of LR is fixed and known early.
6012 It also marks the store volatile, so no optimization is permitted to
6013 remove the store. */
6014 rtx
6015 aarch64_eh_return_handler_rtx (void)
6016 {
6017 rtx tmp = gen_frame_mem (Pmode,
6018 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6019
6020 /* Mark the store volatile, so no optimization is permitted to remove it. */
6021 MEM_VOLATILE_P (tmp) = true;
6022 return tmp;
6023 }
6024
6025 /* Output code to add DELTA to the first argument, and then jump
6026 to FUNCTION. Used for C++ multiple inheritance. */
6027 static void
6028 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6029 HOST_WIDE_INT delta,
6030 HOST_WIDE_INT vcall_offset,
6031 tree function)
6032 {
6033 /* The this pointer is always in x0. Note that this differs from
6034 Arm where the this pointer may be bumped to r1 if r0 is required
6035 to return a pointer to an aggregate. On AArch64 a result value
6036 pointer will be in x8. */
6037 int this_regno = R0_REGNUM;
6038 rtx this_rtx, temp0, temp1, addr, funexp;
6039 rtx_insn *insn;
6040 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6041
6042 if (aarch64_bti_enabled ())
6043 emit_insn (gen_bti_c());
6044
6045 reload_completed = 1;
6046 emit_note (NOTE_INSN_PROLOGUE_END);
6047
6048 this_rtx = gen_rtx_REG (Pmode, this_regno);
6049 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6050 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6051
6052 if (vcall_offset == 0)
6053 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6054 else
6055 {
6056 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6057
6058 addr = this_rtx;
6059 if (delta != 0)
6060 {
6061 if (delta >= -256 && delta < 256)
6062 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6063 plus_constant (Pmode, this_rtx, delta));
6064 else
6065 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6066 temp1, temp0, false);
6067 }
6068
6069 if (Pmode == ptr_mode)
6070 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6071 else
6072 aarch64_emit_move (temp0,
6073 gen_rtx_ZERO_EXTEND (Pmode,
6074 gen_rtx_MEM (ptr_mode, addr)));
6075
6076 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6077 addr = plus_constant (Pmode, temp0, vcall_offset);
6078 else
6079 {
6080 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6081 Pmode);
6082 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6083 }
6084
6085 if (Pmode == ptr_mode)
6086 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6087 else
6088 aarch64_emit_move (temp1,
6089 gen_rtx_SIGN_EXTEND (Pmode,
6090 gen_rtx_MEM (ptr_mode, addr)));
6091
6092 emit_insn (gen_add2_insn (this_rtx, temp1));
6093 }
6094
6095 /* Generate a tail call to the target function. */
6096 if (!TREE_USED (function))
6097 {
6098 assemble_external (function);
6099 TREE_USED (function) = 1;
6100 }
6101 funexp = XEXP (DECL_RTL (function), 0);
6102 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6103 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6104 SIBLING_CALL_P (insn) = 1;
6105
6106 insn = get_insns ();
6107 shorten_branches (insn);
6108
6109 assemble_start_function (thunk, fnname);
6110 final_start_function (insn, file, 1);
6111 final (insn, file, 1);
6112 final_end_function ();
6113 assemble_end_function (thunk, fnname);
6114
6115 /* Stop pretending to be a post-reload pass. */
6116 reload_completed = 0;
6117 }
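/* For the common case of a zero VCALL_OFFSET and a small DELTA, the thunk
   emitted above amounts to no more than (sketch):

     add   x0, x0, #delta
     b     <target function>  */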
6118
6119 static bool
6120 aarch64_tls_referenced_p (rtx x)
6121 {
6122 if (!TARGET_HAVE_TLS)
6123 return false;
6124 subrtx_iterator::array_type array;
6125 FOR_EACH_SUBRTX (iter, array, x, ALL)
6126 {
6127 const_rtx x = *iter;
6128 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6129 return true;
6130 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6131 TLS offsets, not real symbol references. */
6132 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6133 iter.skip_subrtxes ();
6134 }
6135 return false;
6136 }
6137
6138
6139 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6140 a left shift of 0 or 12 bits. */
6141 bool
6142 aarch64_uimm12_shift (HOST_WIDE_INT val)
6143 {
6144 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6145 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6146 );
6147 }
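/* As a worked example of the test above: 0xabc and 0xabc000 are both
   accepted, since their set bits lie entirely within bits [11:0] or
   bits [23:12] respectively, whereas 0xabc0 is rejected because its set
   bits straddle the two fields.  */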
6148
6149 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6150 that can be created with a left shift of 0 or 12. */
6151 static HOST_WIDE_INT
6152 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6153 {
6154 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6155 handle correctly. */
6156 gcc_assert ((val & 0xffffff) == val);
6157
6158 if (((val & 0xfff) << 0) == val)
6159 return val;
6160
6161 return val & (0xfff << 12);
6162 }
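/* For example, for VAL == 0x123456 the low-field test fails, so the
   function returns 0x123000, i.e. VAL with its low 12 bits cleared.  */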
6163
6164 /* Return true if val is an immediate that can be loaded into a
6165 register by a MOVZ instruction. */
6166 static bool
6167 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6168 {
6169 if (GET_MODE_SIZE (mode) > 4)
6170 {
6171 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6172 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6173 return 1;
6174 }
6175 else
6176 {
6177 /* Ignore sign extension. */
6178 val &= (HOST_WIDE_INT) 0xffffffff;
6179 }
6180 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6181 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6182 }
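/* For instance, in DImode both 0x1234 and 0x123400000000 are MOVZ-encodable
   (a single 16-bit chunk at a 16-bit-aligned position), while 0x12340001 is
   not.  In SImode the value is first truncated to 32 bits, so only chunk
   positions 0 and 16 remain.  */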
6183
6184 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6185 64-bit (DImode) integer. */
6186
6187 static unsigned HOST_WIDE_INT
6188 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6189 {
6190 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6191 while (size < 64)
6192 {
6193 val &= (HOST_WIDE_INT_1U << size) - 1;
6194 val |= val << size;
6195 size *= 2;
6196 }
6197 return val;
6198 }
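/* For example, replicating the QImode value 0xf0 gives
   0xf0f0f0f0f0f0f0f0, and replicating the HImode value 0x00ff gives
   0x00ff00ff00ff00ff.  */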
6199
6200 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6201
6202 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6203 {
6204 0x0000000100000001ull,
6205 0x0001000100010001ull,
6206 0x0101010101010101ull,
6207 0x1111111111111111ull,
6208 0x5555555555555555ull,
6209 };
6210
6211
6212 /* Return true if val is a valid bitmask immediate. */
6213
6214 bool
6215 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6216 {
6217 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6218 int bits;
6219
6220 /* Check for a single sequence of one bits and return quickly if so.
6221 The special cases of all ones and all zeroes return false. */
6222 val = aarch64_replicate_bitmask_imm (val_in, mode);
6223 tmp = val + (val & -val);
6224
6225 if (tmp == (tmp & -tmp))
6226 return (val + 1) > 1;
6227
6228 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6229 if (mode == SImode)
6230 val = (val << 32) | (val & 0xffffffff);
6231
6232 /* Invert if the immediate doesn't start with a zero bit - this means we
6233 only need to search for sequences of one bits. */
6234 if (val & 1)
6235 val = ~val;
6236
6237 /* Find the first set bit and set tmp to val with the first sequence of one
6238 bits removed. Return success if there is a single sequence of ones. */
6239 first_one = val & -val;
6240 tmp = val & (val + first_one);
6241
6242 if (tmp == 0)
6243 return true;
6244
6245 /* Find the next set bit and compute the difference in bit position. */
6246 next_one = tmp & -tmp;
6247 bits = clz_hwi (first_one) - clz_hwi (next_one);
6248 mask = val ^ tmp;
6249
6250 /* Check the bit position difference is a power of 2, and that the first
6251 sequence of one bits fits within 'bits' bits. */
6252 if ((mask >> bits) != 0 || bits != (bits & -bits))
6253 return false;
6254
6255 /* Check the sequence of one bits is repeated 64/bits times. */
6256 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6257 }
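/* Worked example for the check above: 0x00ff00ff00ff00ff is accepted.  Its
   runs of ones are 8 bits wide and repeat every 16 bits, so after the
   inversion and the removal of the first run we get BITS == 16 and
   MASK == 0xff00, and MASK * 0x0001000100010001 reproduces the value being
   tested (after the inversion step).  By contrast 0x00ff00ff00ff00fe is
   rejected because its runs of ones do not all have the same width.  */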
6258
6259 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6260 Assumed precondition: VAL_IN is not zero. */
6261
6262 unsigned HOST_WIDE_INT
6263 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6264 {
6265 int lowest_bit_set = ctz_hwi (val_in);
6266 int highest_bit_set = floor_log2 (val_in);
6267 gcc_assert (val_in != 0);
6268
6269 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6270 (HOST_WIDE_INT_1U << lowest_bit_set));
6271 }
6272
6273 /* Create a constant in which the bits outside the range from the lowest
6274 set bit to the highest set bit of VAL_IN are set to 1. */
6275
6276 unsigned HOST_WIDE_INT
6277 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6278 {
6279 return val_in | ~aarch64_and_split_imm1 (val_in);
6280 }
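/* Worked example for the two helpers above: VAL_IN == 0x00f00800 is not
   itself a bitmask immediate.  aarch64_and_split_imm1 returns 0x00fff800
   (a solid run of ones from bit 11 up to bit 23) and aarch64_and_split_imm2
   returns 0xfffffffffff00fff; both of these are valid bitmask immediates and
   their AND reproduces VAL_IN, which is the property aarch64_and_bitmask_imm
   below relies on.  */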
6281
6282 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6283
6284 bool
6285 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6286 {
6287 scalar_int_mode int_mode;
6288 if (!is_a <scalar_int_mode> (mode, &int_mode))
6289 return false;
6290
6291 if (aarch64_bitmask_imm (val_in, int_mode))
6292 return false;
6293
6294 if (aarch64_move_imm (val_in, int_mode))
6295 return false;
6296
6297 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6298
6299 return aarch64_bitmask_imm (imm2, int_mode);
6300 }
6301
6302 /* Return true if val is an immediate that can be loaded into a
6303 register in a single instruction. */
6304 bool
6305 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6306 {
6307 scalar_int_mode int_mode;
6308 if (!is_a <scalar_int_mode> (mode, &int_mode))
6309 return false;
6310
6311 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6312 return 1;
6313 return aarch64_bitmask_imm (val, int_mode);
6314 }
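/* For instance, 0x0000ffff00000000 can be loaded with a single MOVZ,
   0xffff0000ffffffff with a single MOVN (its complement is MOVZ-encodable),
   and 0x00ffffffffffff00 (one run of 48 ones) with a bitmask MOV, i.e. ORR
   with XZR.  */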
6315
6316 static bool
6317 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6318 {
6319 rtx base, offset;
6320
6321 if (GET_CODE (x) == HIGH)
6322 return true;
6323
6324 /* There's no way to calculate VL-based values using relocations. */
6325 subrtx_iterator::array_type array;
6326 FOR_EACH_SUBRTX (iter, array, x, ALL)
6327 if (GET_CODE (*iter) == CONST_POLY_INT)
6328 return true;
6329
6330 split_const (x, &base, &offset);
6331 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6332 {
6333 if (aarch64_classify_symbol (base, INTVAL (offset))
6334 != SYMBOL_FORCE_TO_MEM)
6335 return true;
6336 else
6337 /* Avoid generating a 64-bit relocation in ILP32; leave it
6338 to aarch64_expand_mov_immediate to handle it properly. */
6339 return mode != ptr_mode;
6340 }
6341
6342 return aarch64_tls_referenced_p (x);
6343 }
6344
6345 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6346 The expansion for a table switch is quite expensive due to the number
6347 of instructions, the table lookup and the hard-to-predict indirect jump.
6348 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6349 set, otherwise use tables for > 16 cases as a tradeoff between size and
6350 performance. When optimizing for size, use the default setting. */
6351
6352 static unsigned int
6353 aarch64_case_values_threshold (void)
6354 {
6355 /* Use the specified limit for the number of cases before using jump
6356 tables at higher optimization levels. */
6357 if (optimize > 2
6358 && selected_cpu->tune->max_case_values != 0)
6359 return selected_cpu->tune->max_case_values;
6360 else
6361 return optimize_size ? default_case_values_threshold () : 17;
6362 }
6363
6364 /* Return true if register REGNO is a valid index register.
6365 STRICT_P is true if REG_OK_STRICT is in effect. */
6366
6367 bool
6368 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6369 {
6370 if (!HARD_REGISTER_NUM_P (regno))
6371 {
6372 if (!strict_p)
6373 return true;
6374
6375 if (!reg_renumber)
6376 return false;
6377
6378 regno = reg_renumber[regno];
6379 }
6380 return GP_REGNUM_P (regno);
6381 }
6382
6383 /* Return true if register REGNO is a valid base register.
6384 STRICT_P is true if REG_OK_STRICT is in effect. */
6385
6386 bool
6387 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6388 {
6389 if (!HARD_REGISTER_NUM_P (regno))
6390 {
6391 if (!strict_p)
6392 return true;
6393
6394 if (!reg_renumber)
6395 return false;
6396
6397 regno = reg_renumber[regno];
6398 }
6399
6400 /* The fake registers will be eliminated to either the stack or
6401 hard frame pointer, both of which are usually valid base registers.
6402 Reload deals with the cases where the eliminated form isn't valid. */
6403 return (GP_REGNUM_P (regno)
6404 || regno == SP_REGNUM
6405 || regno == FRAME_POINTER_REGNUM
6406 || regno == ARG_POINTER_REGNUM);
6407 }
6408
6409 /* Return true if X is a valid base register.
6410 STRICT_P is true if REG_OK_STRICT is in effect. */
6411
6412 static bool
6413 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6414 {
6415 if (!strict_p
6416 && GET_CODE (x) == SUBREG
6417 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6418 x = SUBREG_REG (x);
6419
6420 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6421 }
6422
6423 /* Return true if address offset is a valid index. If it is, fill in INFO
6424 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6425
6426 static bool
6427 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6428 machine_mode mode, bool strict_p)
6429 {
6430 enum aarch64_address_type type;
6431 rtx index;
6432 int shift;
6433
6434 /* (reg:P) */
6435 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6436 && GET_MODE (x) == Pmode)
6437 {
6438 type = ADDRESS_REG_REG;
6439 index = x;
6440 shift = 0;
6441 }
6442 /* (sign_extend:DI (reg:SI)) */
6443 else if ((GET_CODE (x) == SIGN_EXTEND
6444 || GET_CODE (x) == ZERO_EXTEND)
6445 && GET_MODE (x) == DImode
6446 && GET_MODE (XEXP (x, 0)) == SImode)
6447 {
6448 type = (GET_CODE (x) == SIGN_EXTEND)
6449 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6450 index = XEXP (x, 0);
6451 shift = 0;
6452 }
6453 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6454 else if (GET_CODE (x) == MULT
6455 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6456 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6457 && GET_MODE (XEXP (x, 0)) == DImode
6458 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6459 && CONST_INT_P (XEXP (x, 1)))
6460 {
6461 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6462 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6463 index = XEXP (XEXP (x, 0), 0);
6464 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6465 }
6466 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6467 else if (GET_CODE (x) == ASHIFT
6468 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6469 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6470 && GET_MODE (XEXP (x, 0)) == DImode
6471 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6472 && CONST_INT_P (XEXP (x, 1)))
6473 {
6474 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6475 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6476 index = XEXP (XEXP (x, 0), 0);
6477 shift = INTVAL (XEXP (x, 1));
6478 }
6479 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6480 else if ((GET_CODE (x) == SIGN_EXTRACT
6481 || GET_CODE (x) == ZERO_EXTRACT)
6482 && GET_MODE (x) == DImode
6483 && GET_CODE (XEXP (x, 0)) == MULT
6484 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6485 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6486 {
6487 type = (GET_CODE (x) == SIGN_EXTRACT)
6488 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6489 index = XEXP (XEXP (x, 0), 0);
6490 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6491 if (INTVAL (XEXP (x, 1)) != 32 + shift
6492 || INTVAL (XEXP (x, 2)) != 0)
6493 shift = -1;
6494 }
6495 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6496 (const_int 0xffffffff<<shift)) */
6497 else if (GET_CODE (x) == AND
6498 && GET_MODE (x) == DImode
6499 && GET_CODE (XEXP (x, 0)) == MULT
6500 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6501 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6502 && CONST_INT_P (XEXP (x, 1)))
6503 {
6504 type = ADDRESS_REG_UXTW;
6505 index = XEXP (XEXP (x, 0), 0);
6506 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6507 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6508 shift = -1;
6509 }
6510 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6511 else if ((GET_CODE (x) == SIGN_EXTRACT
6512 || GET_CODE (x) == ZERO_EXTRACT)
6513 && GET_MODE (x) == DImode
6514 && GET_CODE (XEXP (x, 0)) == ASHIFT
6515 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6516 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6517 {
6518 type = (GET_CODE (x) == SIGN_EXTRACT)
6519 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6520 index = XEXP (XEXP (x, 0), 0);
6521 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6522 if (INTVAL (XEXP (x, 1)) != 32 + shift
6523 || INTVAL (XEXP (x, 2)) != 0)
6524 shift = -1;
6525 }
6526 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6527 (const_int 0xffffffff<<shift)) */
6528 else if (GET_CODE (x) == AND
6529 && GET_MODE (x) == DImode
6530 && GET_CODE (XEXP (x, 0)) == ASHIFT
6531 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6532 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6533 && CONST_INT_P (XEXP (x, 1)))
6534 {
6535 type = ADDRESS_REG_UXTW;
6536 index = XEXP (XEXP (x, 0), 0);
6537 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6538 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6539 shift = -1;
6540 }
6541 /* (mult:P (reg:P) (const_int scale)) */
6542 else if (GET_CODE (x) == MULT
6543 && GET_MODE (x) == Pmode
6544 && GET_MODE (XEXP (x, 0)) == Pmode
6545 && CONST_INT_P (XEXP (x, 1)))
6546 {
6547 type = ADDRESS_REG_REG;
6548 index = XEXP (x, 0);
6549 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6550 }
6551 /* (ashift:P (reg:P) (const_int shift)) */
6552 else if (GET_CODE (x) == ASHIFT
6553 && GET_MODE (x) == Pmode
6554 && GET_MODE (XEXP (x, 0)) == Pmode
6555 && CONST_INT_P (XEXP (x, 1)))
6556 {
6557 type = ADDRESS_REG_REG;
6558 index = XEXP (x, 0);
6559 shift = INTVAL (XEXP (x, 1));
6560 }
6561 else
6562 return false;
6563
6564 if (!strict_p
6565 && GET_CODE (index) == SUBREG
6566 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6567 index = SUBREG_REG (index);
6568
6569 if (aarch64_sve_data_mode_p (mode))
6570 {
6571 if (type != ADDRESS_REG_REG
6572 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6573 return false;
6574 }
6575 else
6576 {
6577 if (shift != 0
6578 && !(IN_RANGE (shift, 1, 3)
6579 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6580 return false;
6581 }
6582
6583 if (REG_P (index)
6584 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6585 {
6586 info->type = type;
6587 info->offset = index;
6588 info->shift = shift;
6589 return true;
6590 }
6591
6592 return false;
6593 }
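/* The classifications above correspond to the following assembly addressing
   forms (#S standing for the shift recorded in INFO->shift):
     ADDRESS_REG_REG    [Xn, Xm{, LSL #S}]
     ADDRESS_REG_SXTW   [Xn, Wm, SXTW {#S}]
     ADDRESS_REG_UXTW   [Xn, Wm, UXTW {#S}]  */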
6594
6595 /* Return true if MODE is one of the modes for which we
6596 support LDP/STP operations. */
6597
6598 static bool
6599 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6600 {
6601 return mode == SImode || mode == DImode
6602 || mode == SFmode || mode == DFmode
6603 || (aarch64_vector_mode_supported_p (mode)
6604 && (known_eq (GET_MODE_SIZE (mode), 8)
6605 || (known_eq (GET_MODE_SIZE (mode), 16)
6606 && (aarch64_tune_params.extra_tuning_flags
6607 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6608 }
6609
6610 /* Return true if REGNO is a virtual pointer register, or an eliminable
6611 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6612 include stack_pointer or hard_frame_pointer. */
6613 static bool
6614 virt_or_elim_regno_p (unsigned regno)
6615 {
6616 return ((regno >= FIRST_VIRTUAL_REGISTER
6617 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6618 || regno == FRAME_POINTER_REGNUM
6619 || regno == ARG_POINTER_REGNUM);
6620 }
6621
6622 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6623 If it is, fill in INFO appropriately. STRICT_P is true if
6624 REG_OK_STRICT is in effect. */
6625
6626 bool
6627 aarch64_classify_address (struct aarch64_address_info *info,
6628 rtx x, machine_mode mode, bool strict_p,
6629 aarch64_addr_query_type type)
6630 {
6631 enum rtx_code code = GET_CODE (x);
6632 rtx op0, op1;
6633 poly_int64 offset;
6634
6635 HOST_WIDE_INT const_size;
6636
6637 /* On BE, we use load/store pair for all large int mode load/stores.
6638 TI/TFmode may also use a load/store pair. */
6639 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6640 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6641 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6642 || type == ADDR_QUERY_LDP_STP_N
6643 || mode == TImode
6644 || mode == TFmode
6645 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6646
6647 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6648 corresponds to the actual size of the memory being loaded/stored and the
6649 mode used for the address check is half of that. */
6650 if (type == ADDR_QUERY_LDP_STP_N
6651 && known_eq (GET_MODE_SIZE (mode), 16))
6652 mode = DFmode;
6653
6654 bool allow_reg_index_p = (!load_store_pair_p
6655 && (known_lt (GET_MODE_SIZE (mode), 16)
6656 || vec_flags == VEC_ADVSIMD
6657 || vec_flags & VEC_SVE_DATA));
6658
6659 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6660 [Rn, #offset, MUL VL]. */
6661 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6662 && (code != REG && code != PLUS))
6663 return false;
6664
6665 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6666 REG addressing. */
6667 if (advsimd_struct_p
6668 && !BYTES_BIG_ENDIAN
6669 && (code != POST_INC && code != REG))
6670 return false;
6671
6672 gcc_checking_assert (GET_MODE (x) == VOIDmode
6673 || SCALAR_INT_MODE_P (GET_MODE (x)));
6674
6675 switch (code)
6676 {
6677 case REG:
6678 case SUBREG:
6679 info->type = ADDRESS_REG_IMM;
6680 info->base = x;
6681 info->offset = const0_rtx;
6682 info->const_offset = 0;
6683 return aarch64_base_register_rtx_p (x, strict_p);
6684
6685 case PLUS:
6686 op0 = XEXP (x, 0);
6687 op1 = XEXP (x, 1);
6688
6689 if (! strict_p
6690 && REG_P (op0)
6691 && virt_or_elim_regno_p (REGNO (op0))
6692 && poly_int_rtx_p (op1, &offset))
6693 {
6694 info->type = ADDRESS_REG_IMM;
6695 info->base = op0;
6696 info->offset = op1;
6697 info->const_offset = offset;
6698
6699 return true;
6700 }
6701
6702 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6703 && aarch64_base_register_rtx_p (op0, strict_p)
6704 && poly_int_rtx_p (op1, &offset))
6705 {
6706 info->type = ADDRESS_REG_IMM;
6707 info->base = op0;
6708 info->offset = op1;
6709 info->const_offset = offset;
6710
6711 /* TImode and TFmode values are allowed in both pairs of X
6712 registers and individual Q registers. The available
6713 address modes are:
6714 X,X: 7-bit signed scaled offset
6715 Q: 9-bit signed offset
6716 We conservatively require an offset representable in either mode.
6717 When performing the check for pairs of X registers i.e. LDP/STP
6718 pass down DImode since that is the natural size of the LDP/STP
6719 instruction memory accesses. */
6720 if (mode == TImode || mode == TFmode)
6721 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6722 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6723 || offset_12bit_unsigned_scaled_p (mode, offset)));
6724
6725 /* A 7-bit offset check because OImode will emit an ldp/stp
6726 instruction (only big endian will get here).
6727 For ldp/stp instructions, the offset is scaled for the size of a
6728 single element of the pair. */
6729 if (mode == OImode)
6730 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6731
6732 /* Three 9/12-bit offset checks because CImode will emit three
6733 ldr/str instructions (only big endian will get here). */
6734 if (mode == CImode)
6735 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6736 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6737 offset + 32)
6738 || offset_12bit_unsigned_scaled_p (V16QImode,
6739 offset + 32)));
6740
6741 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6742 instructions (only big endian will get here). */
6743 if (mode == XImode)
6744 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6745 && aarch64_offset_7bit_signed_scaled_p (TImode,
6746 offset + 32));
6747
6748 /* Make "m" use the LD1 offset range for SVE data modes, so
6749 that pre-RTL optimizers like ivopts will work with that range
6750 instead of the wider LDR/STR range. */
6751 if (vec_flags == VEC_SVE_DATA)
6752 return (type == ADDR_QUERY_M
6753 ? offset_4bit_signed_scaled_p (mode, offset)
6754 : offset_9bit_signed_scaled_p (mode, offset));
6755
6756 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6757 {
6758 poly_int64 end_offset = (offset
6759 + GET_MODE_SIZE (mode)
6760 - BYTES_PER_SVE_VECTOR);
6761 return (type == ADDR_QUERY_M
6762 ? offset_4bit_signed_scaled_p (mode, offset)
6763 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6764 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6765 end_offset)));
6766 }
6767
6768 if (vec_flags == VEC_SVE_PRED)
6769 return offset_9bit_signed_scaled_p (mode, offset);
6770
6771 if (load_store_pair_p)
6772 return ((known_eq (GET_MODE_SIZE (mode), 4)
6773 || known_eq (GET_MODE_SIZE (mode), 8)
6774 || known_eq (GET_MODE_SIZE (mode), 16))
6775 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6776 else
6777 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6778 || offset_12bit_unsigned_scaled_p (mode, offset));
6779 }
6780
6781 if (allow_reg_index_p)
6782 {
6783 /* Look for base + (scaled/extended) index register. */
6784 if (aarch64_base_register_rtx_p (op0, strict_p)
6785 && aarch64_classify_index (info, op1, mode, strict_p))
6786 {
6787 info->base = op0;
6788 return true;
6789 }
6790 if (aarch64_base_register_rtx_p (op1, strict_p)
6791 && aarch64_classify_index (info, op0, mode, strict_p))
6792 {
6793 info->base = op1;
6794 return true;
6795 }
6796 }
6797
6798 return false;
6799
6800 case POST_INC:
6801 case POST_DEC:
6802 case PRE_INC:
6803 case PRE_DEC:
6804 info->type = ADDRESS_REG_WB;
6805 info->base = XEXP (x, 0);
6806 info->offset = NULL_RTX;
6807 return aarch64_base_register_rtx_p (info->base, strict_p);
6808
6809 case POST_MODIFY:
6810 case PRE_MODIFY:
6811 info->type = ADDRESS_REG_WB;
6812 info->base = XEXP (x, 0);
6813 if (GET_CODE (XEXP (x, 1)) == PLUS
6814 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6815 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6816 && aarch64_base_register_rtx_p (info->base, strict_p))
6817 {
6818 info->offset = XEXP (XEXP (x, 1), 1);
6819 info->const_offset = offset;
6820
6821 /* TImode and TFmode values are allowed in both pairs of X
6822 registers and individual Q registers. The available
6823 address modes are:
6824 X,X: 7-bit signed scaled offset
6825 Q: 9-bit signed offset
6826 We conservatively require an offset representable in either mode.
6827 */
6828 if (mode == TImode || mode == TFmode)
6829 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6830 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6831
6832 if (load_store_pair_p)
6833 return ((known_eq (GET_MODE_SIZE (mode), 4)
6834 || known_eq (GET_MODE_SIZE (mode), 8)
6835 || known_eq (GET_MODE_SIZE (mode), 16))
6836 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6837 else
6838 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6839 }
6840 return false;
6841
6842 case CONST:
6843 case SYMBOL_REF:
6844 case LABEL_REF:
6845 /* load literal: pc-relative constant pool entry. Only supported
6846 for SI mode or larger. */
6847 info->type = ADDRESS_SYMBOLIC;
6848
6849 if (!load_store_pair_p
6850 && GET_MODE_SIZE (mode).is_constant (&const_size)
6851 && const_size >= 4)
6852 {
6853 rtx sym, addend;
6854
6855 split_const (x, &sym, &addend);
6856 return ((GET_CODE (sym) == LABEL_REF
6857 || (GET_CODE (sym) == SYMBOL_REF
6858 && CONSTANT_POOL_ADDRESS_P (sym)
6859 && aarch64_pcrelative_literal_loads)));
6860 }
6861 return false;
6862
6863 case LO_SUM:
6864 info->type = ADDRESS_LO_SUM;
6865 info->base = XEXP (x, 0);
6866 info->offset = XEXP (x, 1);
6867 if (allow_reg_index_p
6868 && aarch64_base_register_rtx_p (info->base, strict_p))
6869 {
6870 rtx sym, offs;
6871 split_const (info->offset, &sym, &offs);
6872 if (GET_CODE (sym) == SYMBOL_REF
6873 && (aarch64_classify_symbol (sym, INTVAL (offs))
6874 == SYMBOL_SMALL_ABSOLUTE))
6875 {
6876 /* The symbol and offset must be aligned to the access size. */
6877 unsigned int align;
6878
6879 if (CONSTANT_POOL_ADDRESS_P (sym))
6880 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6881 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6882 {
6883 tree exp = SYMBOL_REF_DECL (sym);
6884 align = TYPE_ALIGN (TREE_TYPE (exp));
6885 align = aarch64_constant_alignment (exp, align);
6886 }
6887 else if (SYMBOL_REF_DECL (sym))
6888 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6889 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6890 && SYMBOL_REF_BLOCK (sym) != NULL)
6891 align = SYMBOL_REF_BLOCK (sym)->alignment;
6892 else
6893 align = BITS_PER_UNIT;
6894
6895 poly_int64 ref_size = GET_MODE_SIZE (mode);
6896 if (known_eq (ref_size, 0))
6897 ref_size = GET_MODE_SIZE (DImode);
6898
6899 return (multiple_p (INTVAL (offs), ref_size)
6900 && multiple_p (align / BITS_PER_UNIT, ref_size));
6901 }
6902 }
6903 return false;
6904
6905 default:
6906 return false;
6907 }
6908 }
6909
6910 /* Return true if the address X is valid for a PRFM instruction.
6911 STRICT_P is true if we should do strict checking with
6912 aarch64_classify_address. */
6913
6914 bool
6915 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6916 {
6917 struct aarch64_address_info addr;
6918
6919 /* PRFM accepts the same addresses as DImode... */
6920 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6921 if (!res)
6922 return false;
6923
6924 /* ... except writeback forms. */
6925 return addr.type != ADDRESS_REG_WB;
6926 }
6927
6928 bool
6929 aarch64_symbolic_address_p (rtx x)
6930 {
6931 rtx offset;
6932
6933 split_const (x, &x, &offset);
6934 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6935 }
6936
6937 /* Classify the base of symbolic expression X. */
6938
6939 enum aarch64_symbol_type
6940 aarch64_classify_symbolic_expression (rtx x)
6941 {
6942 rtx offset;
6943
6944 split_const (x, &x, &offset);
6945 return aarch64_classify_symbol (x, INTVAL (offset));
6946 }
6947
6948
6949 /* Return TRUE if X is a legitimate address for accessing memory in
6950 mode MODE. */
6951 static bool
6952 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6953 {
6954 struct aarch64_address_info addr;
6955
6956 return aarch64_classify_address (&addr, x, mode, strict_p);
6957 }
6958
6959 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6960 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6961 bool
6962 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6963 aarch64_addr_query_type type)
6964 {
6965 struct aarch64_address_info addr;
6966
6967 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6968 }
6969
6970 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6971
6972 static bool
6973 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6974 poly_int64 orig_offset,
6975 machine_mode mode)
6976 {
6977 HOST_WIDE_INT size;
6978 if (GET_MODE_SIZE (mode).is_constant (&size))
6979 {
6980 HOST_WIDE_INT const_offset, second_offset;
6981
6982 /* A general SVE offset is A * VQ + B. Remove the A component from
6983 coefficient 0 in order to get the constant B. */
6984 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6985
6986 /* Split an out-of-range address displacement into a base and
6987 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6988 range otherwise to increase opportunities for sharing the base
6989 address of different sizes. Unaligned accesses use the signed
6990 9-bit range, TImode/TFmode use the intersection of signed
6991 scaled 7-bit and signed 9-bit offset. */
6992 if (mode == TImode || mode == TFmode)
6993 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6994 else if ((const_offset & (size - 1)) != 0)
6995 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6996 else
6997 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6998
6999 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7000 return false;
7001
7002 /* Split the offset into second_offset and the rest. */
7003 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7004 *offset2 = gen_int_mode (second_offset, Pmode);
7005 return true;
7006 }
7007 else
7008 {
7009 /* Get the mode we should use as the basis of the range. For structure
7010 modes this is the mode of one vector. */
7011 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7012 machine_mode step_mode
7013 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7014
7015 /* Get the "mul vl" multiplier we'd like to use. */
7016 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7017 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7018 if (vec_flags & VEC_SVE_DATA)
7019 /* LDR supports a 9-bit range, but the move patterns for
7020 structure modes require all vectors to be in range of the
7021 same base. The simplest way of accommodating that while still
7022 promoting reuse of anchor points between different modes is
7023 to use an 8-bit range unconditionally. */
7024 vnum = ((vnum + 128) & 255) - 128;
7025 else
7026 /* Predicates are only handled singly, so we might as well use
7027 the full range. */
7028 vnum = ((vnum + 256) & 511) - 256;
7029 if (vnum == 0)
7030 return false;
7031
7032 /* Convert the "mul vl" multiplier into a byte offset. */
7033 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7034 if (known_eq (second_offset, orig_offset))
7035 return false;
7036
7037 /* Split the offset into second_offset and the rest. */
7038 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7039 *offset2 = gen_int_mode (second_offset, Pmode);
7040 return true;
7041 }
7042 }
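/* A minimal standalone sketch of the fixed-size splitting rules above.
   The helper name is hypothetical and nothing else refers to it; it only
   restates the three masks used above: TImode/TFmode take the intersection
   of the scaled 7-bit and signed 9-bit ranges, unaligned offsets take the
   signed 9-bit range, and aligned offsets take a 4KB or 16KB range.  */
static HOST_WIDE_INT ATTRIBUTE_UNUSED
example_fixed_size_split (HOST_WIDE_INT const_offset, int size,
                          bool ti_or_tf_p)
{
  if (ti_or_tf_p)
    return ((const_offset + 0x100) & 0x1f8) - 0x100;
  if ((const_offset & (size - 1)) != 0)
    return ((const_offset + 0x100) & 0x1ff) - 0x100;
  return const_offset & (size < 4 ? 0xfff : 0x3ffc);
}

/* For an aligned 8-byte access at offset 0x12340 this keeps 0x2340 with
   the access (offset2 above) and folds the remaining 0x10000 into the
   shared base address (offset1 above).  */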
7043
7044 /* Return the binary representation of floating point constant VALUE in INTVAL.
7045 If the value cannot be converted, return false without setting INTVAL.
7046 The conversion is done in the mode of VALUE. */
7047 bool
7048 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7049 {
7050
7051 /* We make a general exception for 0. */
7052 if (aarch64_float_const_zero_rtx_p (value))
7053 {
7054 *intval = 0;
7055 return true;
7056 }
7057
7058 scalar_float_mode mode;
7059 if (GET_CODE (value) != CONST_DOUBLE
7060 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7061 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7062 /* Only support up to DF mode. */
7063 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7064 return false;
7065
7066 unsigned HOST_WIDE_INT ival = 0;
7067
7068 long res[2];
7069 real_to_target (res,
7070 CONST_DOUBLE_REAL_VALUE (value),
7071 REAL_MODE_FORMAT (mode));
7072
7073 if (mode == DFmode)
7074 {
7075 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7076 ival = zext_hwi (res[order], 32);
7077 ival |= (zext_hwi (res[1 - order], 32) << 32);
7078 }
7079 else
7080 ival = zext_hwi (res[0], 32);
7081
7082 *intval = ival;
7083 return true;
7084 }
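/* For intuition only: the function above does for GCC's internal
   REAL_VALUE_TYPE constants what this host-side sketch does for a plain
   'double' (hypothetical helper, assumes the host double is IEEE
   binary64 and that HOST_WIDE_INT is 64 bits wide).  */
static unsigned HOST_WIDE_INT ATTRIBUTE_UNUSED
example_host_double_bits (double d)
{
  unsigned HOST_WIDE_INT bits = 0;
  /* memcpy is the well-defined way to reinterpret the representation.  */
  memcpy (&bits, &d, sizeof (d));
  return bits;
}

/* example_host_double_bits (1.0) is 0x3ff0000000000000; constants with
   such simple bit images are exactly the ones that are cheaper to build
   with MOV/MOVK and FMOV than to load from the constant pool.  */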
7085
7086 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7087 single MOV(+MOVK) followed by an FMOV. */
7088 bool
7089 aarch64_float_const_rtx_p (rtx x)
7090 {
7091 machine_mode mode = GET_MODE (x);
7092 if (mode == VOIDmode)
7093 return false;
7094
7095 /* Determine whether it's cheaper to write float constants as
7096 mov/movk pairs rather than loading them with ldr/adrp pairs. */
7097 unsigned HOST_WIDE_INT ival;
7098
7099 if (GET_CODE (x) == CONST_DOUBLE
7100 && SCALAR_FLOAT_MODE_P (mode)
7101 && aarch64_reinterpret_float_as_int (x, &ival))
7102 {
7103 scalar_int_mode imode = (mode == HFmode
7104 ? SImode
7105 : int_mode_for_mode (mode).require ());
7106 int num_instr = aarch64_internal_mov_immediate
7107 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7108 return num_instr < 3;
7109 }
7110
7111 return false;
7112 }
7113
7114 /* Return TRUE if rtx X is the immediate constant 0.0. */
7115 bool
7116 aarch64_float_const_zero_rtx_p (rtx x)
7117 {
7118 if (GET_MODE (x) == VOIDmode)
7119 return false;
7120
7121 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7122 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7123 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7124 }
7125
7126 /* Return TRUE if rtx X is an immediate constant that fits in a single
7127 MOVI immediate operation. */
7128 bool
7129 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7130 {
7131 if (!TARGET_SIMD)
7132 return false;
7133
7134 machine_mode vmode;
7135 scalar_int_mode imode;
7136 unsigned HOST_WIDE_INT ival;
7137
7138 if (GET_CODE (x) == CONST_DOUBLE
7139 && SCALAR_FLOAT_MODE_P (mode))
7140 {
7141 if (!aarch64_reinterpret_float_as_int (x, &ival))
7142 return false;
7143
7144 /* We make a general exception for 0. */
7145 if (aarch64_float_const_zero_rtx_p (x))
7146 return true;
7147
7148 imode = int_mode_for_mode (mode).require ();
7149 }
7150 else if (GET_CODE (x) == CONST_INT
7151 && is_a <scalar_int_mode> (mode, &imode))
7152 ival = INTVAL (x);
7153 else
7154 return false;
7155
7156 /* Use a 64-bit container mode for everything except DI/DF mode, where we
7157 use a 128-bit vector mode. */
7158 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7159
7160 vmode = aarch64_simd_container_mode (imode, width);
7161 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7162
7163 return aarch64_simd_valid_immediate (v_op, NULL);
7164 }
7165
7166
7167 /* Return the fixed registers used for condition codes. */
7168
7169 static bool
7170 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7171 {
7172 *p1 = CC_REGNUM;
7173 *p2 = INVALID_REGNUM;
7174 return true;
7175 }
7176
7177 /* This function is used by the call expanders of the machine description.
7178 RESULT is the register in which the result is returned. It's NULL for
7179 "call" and "sibcall".
7180 MEM is the location of the function call.
7181 SIBCALL indicates whether this function call is a normal call or a sibling call.
7182 A different pattern is generated accordingly. */
7183
7184 void
7185 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7186 {
7187 rtx call, callee, tmp;
7188 rtvec vec;
7189 machine_mode mode;
7190
7191 gcc_assert (MEM_P (mem));
7192 callee = XEXP (mem, 0);
7193 mode = GET_MODE (callee);
7194 gcc_assert (mode == Pmode);
7195
7196 /* Decide if we should generate indirect calls by loading the
7197 address of the callee into a register before performing
7198 the branch-and-link. */
7199 if (SYMBOL_REF_P (callee)
7200 ? (aarch64_is_long_call_p (callee)
7201 || aarch64_is_noplt_call_p (callee))
7202 : !REG_P (callee))
7203 XEXP (mem, 0) = force_reg (mode, callee);
7204
7205 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7206
7207 if (result != NULL_RTX)
7208 call = gen_rtx_SET (result, call);
7209
7210 if (sibcall)
7211 tmp = ret_rtx;
7212 else
7213 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7214
7215 vec = gen_rtvec (2, call, tmp);
7216 call = gen_rtx_PARALLEL (VOIDmode, vec);
7217
7218 aarch64_emit_call_insn (call);
7219 }
7220
7221 /* Emit call insn with PAT and do aarch64-specific handling. */
7222
7223 void
7224 aarch64_emit_call_insn (rtx pat)
7225 {
7226 rtx insn = emit_call_insn (pat);
7227
7228 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7229 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7230 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7231 }
7232
7233 machine_mode
7234 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7235 {
7236 machine_mode mode_x = GET_MODE (x);
7237 rtx_code code_x = GET_CODE (x);
7238
7239 /* All floating point compares return CCFP if it is an equality or
7240 unordered comparison, and CCFPE otherwise. */
7241 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7242 {
7243 switch (code)
7244 {
7245 case EQ:
7246 case NE:
7247 case UNORDERED:
7248 case ORDERED:
7249 case UNLT:
7250 case UNLE:
7251 case UNGT:
7252 case UNGE:
7253 case UNEQ:
7254 return CCFPmode;
7255
7256 case LT:
7257 case LE:
7258 case GT:
7259 case GE:
7260 case LTGT:
7261 return CCFPEmode;
7262
7263 default:
7264 gcc_unreachable ();
7265 }
7266 }
7267
7268 /* Equality comparisons of short modes against zero can be performed
7269 using the TST instruction with the appropriate bitmask. */
7270 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7271 && (code == EQ || code == NE)
7272 && (mode_x == HImode || mode_x == QImode))
7273 return CC_NZmode;
7274
7275 /* Similarly, comparisons of zero_extends from shorter modes can
7276 be performed using an ANDS with an immediate mask. */
7277 if (y == const0_rtx && code_x == ZERO_EXTEND
7278 && (mode_x == SImode || mode_x == DImode)
7279 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7280 && (code == EQ || code == NE))
7281 return CC_NZmode;
7282
7283 if ((mode_x == SImode || mode_x == DImode)
7284 && y == const0_rtx
7285 && (code == EQ || code == NE || code == LT || code == GE)
7286 && (code_x == PLUS || code_x == MINUS || code_x == AND
7287 || code_x == NEG
7288 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7289 && CONST_INT_P (XEXP (x, 2)))))
7290 return CC_NZmode;
7291
7292 /* A compare with a shifted operand. Because of canonicalization,
7293 the comparison will have to be swapped when we emit the assembly
7294 code. */
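/* For instance, a compare of (ashift x 2) with y can only use the
   shifted-register form as "cmp y, x, lsl 2", which performs the
   subtraction the other way round, so a GE test must be emitted as LE
   (see the E_CC_SWPmode entries in aarch64_get_condition_code_1).  */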
7295 if ((mode_x == SImode || mode_x == DImode)
7296 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7297 && (code_x == ASHIFT || code_x == ASHIFTRT
7298 || code_x == LSHIFTRT
7299 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7300 return CC_SWPmode;
7301
7302 /* Similarly for a negated operand, but we can only do this for
7303 equalities. */
7304 if ((mode_x == SImode || mode_x == DImode)
7305 && (REG_P (y) || GET_CODE (y) == SUBREG)
7306 && (code == EQ || code == NE)
7307 && code_x == NEG)
7308 return CC_Zmode;
7309
7310 /* A test for unsigned overflow from an addition. */
7311 if ((mode_x == DImode || mode_x == TImode)
7312 && (code == LTU || code == GEU)
7313 && code_x == PLUS
7314 && rtx_equal_p (XEXP (x, 0), y))
7315 return CC_Cmode;
7316
7317 /* A test for unsigned overflow from an add with carry. */
7318 if ((mode_x == DImode || mode_x == TImode)
7319 && (code == LTU || code == GEU)
7320 && code_x == PLUS
7321 && CONST_SCALAR_INT_P (y)
7322 && (rtx_mode_t (y, mode_x)
7323 == (wi::shwi (1, mode_x)
7324 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7325 return CC_ADCmode;
7326
7327 /* A test for signed overflow. */
7328 if ((mode_x == DImode || mode_x == TImode)
7329 && code == NE
7330 && code_x == PLUS
7331 && GET_CODE (y) == SIGN_EXTEND)
7332 return CC_Vmode;
7333
7334 /* For everything else, return CCmode. */
7335 return CCmode;
7336 }
7337
7338 static int
7339 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7340
7341 int
7342 aarch64_get_condition_code (rtx x)
7343 {
7344 machine_mode mode = GET_MODE (XEXP (x, 0));
7345 enum rtx_code comp_code = GET_CODE (x);
7346
7347 if (GET_MODE_CLASS (mode) != MODE_CC)
7348 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7349 return aarch64_get_condition_code_1 (mode, comp_code);
7350 }
7351
7352 static int
7353 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7354 {
7355 switch (mode)
7356 {
7357 case E_CCFPmode:
7358 case E_CCFPEmode:
7359 switch (comp_code)
7360 {
7361 case GE: return AARCH64_GE;
7362 case GT: return AARCH64_GT;
7363 case LE: return AARCH64_LS;
7364 case LT: return AARCH64_MI;
7365 case NE: return AARCH64_NE;
7366 case EQ: return AARCH64_EQ;
7367 case ORDERED: return AARCH64_VC;
7368 case UNORDERED: return AARCH64_VS;
7369 case UNLT: return AARCH64_LT;
7370 case UNLE: return AARCH64_LE;
7371 case UNGT: return AARCH64_HI;
7372 case UNGE: return AARCH64_PL;
7373 default: return -1;
7374 }
7375 break;
7376
7377 case E_CCmode:
7378 switch (comp_code)
7379 {
7380 case NE: return AARCH64_NE;
7381 case EQ: return AARCH64_EQ;
7382 case GE: return AARCH64_GE;
7383 case GT: return AARCH64_GT;
7384 case LE: return AARCH64_LE;
7385 case LT: return AARCH64_LT;
7386 case GEU: return AARCH64_CS;
7387 case GTU: return AARCH64_HI;
7388 case LEU: return AARCH64_LS;
7389 case LTU: return AARCH64_CC;
7390 default: return -1;
7391 }
7392 break;
7393
7394 case E_CC_SWPmode:
7395 switch (comp_code)
7396 {
7397 case NE: return AARCH64_NE;
7398 case EQ: return AARCH64_EQ;
7399 case GE: return AARCH64_LE;
7400 case GT: return AARCH64_LT;
7401 case LE: return AARCH64_GE;
7402 case LT: return AARCH64_GT;
7403 case GEU: return AARCH64_LS;
7404 case GTU: return AARCH64_CC;
7405 case LEU: return AARCH64_CS;
7406 case LTU: return AARCH64_HI;
7407 default: return -1;
7408 }
7409 break;
7410
7411 case E_CC_NZCmode:
7412 switch (comp_code)
7413 {
7414 case NE: return AARCH64_NE; /* = any */
7415 case EQ: return AARCH64_EQ; /* = none */
7416 case GE: return AARCH64_PL; /* = nfrst */
7417 case LT: return AARCH64_MI; /* = first */
7418 case GEU: return AARCH64_CS; /* = nlast */
7419 case GTU: return AARCH64_HI; /* = pmore */
7420 case LEU: return AARCH64_LS; /* = plast */
7421 case LTU: return AARCH64_CC; /* = last */
7422 default: return -1;
7423 }
7424 break;
7425
7426 case E_CC_NZmode:
7427 switch (comp_code)
7428 {
7429 case NE: return AARCH64_NE;
7430 case EQ: return AARCH64_EQ;
7431 case GE: return AARCH64_PL;
7432 case LT: return AARCH64_MI;
7433 default: return -1;
7434 }
7435 break;
7436
7437 case E_CC_Zmode:
7438 switch (comp_code)
7439 {
7440 case NE: return AARCH64_NE;
7441 case EQ: return AARCH64_EQ;
7442 default: return -1;
7443 }
7444 break;
7445
7446 case E_CC_Cmode:
7447 switch (comp_code)
7448 {
7449 case LTU: return AARCH64_CS;
7450 case GEU: return AARCH64_CC;
7451 default: return -1;
7452 }
7453 break;
7454
7455 case E_CC_ADCmode:
7456 switch (comp_code)
7457 {
7458 case GEU: return AARCH64_CS;
7459 case LTU: return AARCH64_CC;
7460 default: return -1;
7461 }
7462 break;
7463
7464 case E_CC_Vmode:
7465 switch (comp_code)
7466 {
7467 case NE: return AARCH64_VS;
7468 case EQ: return AARCH64_VC;
7469 default: return -1;
7470 }
7471 break;
7472
7473 default:
7474 return -1;
7475 }
7476
7477 return -1;
7478 }
7479
7480 bool
7481 aarch64_const_vec_all_same_in_range_p (rtx x,
7482 HOST_WIDE_INT minval,
7483 HOST_WIDE_INT maxval)
7484 {
7485 rtx elt;
7486 return (const_vec_duplicate_p (x, &elt)
7487 && CONST_INT_P (elt)
7488 && IN_RANGE (INTVAL (elt), minval, maxval));
7489 }
7490
7491 bool
7492 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7493 {
7494 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7495 }
7496
7497 /* Return true if VEC is a constant in which every element is in the range
7498 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7499
7500 static bool
7501 aarch64_const_vec_all_in_range_p (rtx vec,
7502 HOST_WIDE_INT minval,
7503 HOST_WIDE_INT maxval)
7504 {
7505 if (GET_CODE (vec) != CONST_VECTOR
7506 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7507 return false;
7508
7509 int nunits;
7510 if (!CONST_VECTOR_STEPPED_P (vec))
7511 nunits = const_vector_encoded_nelts (vec);
7512 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7513 return false;
7514
7515 for (int i = 0; i < nunits; i++)
7516 {
7517 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7518 if (!CONST_INT_P (vec_elem)
7519 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7520 return false;
7521 }
7522 return true;
7523 }
7524
7525 /* N Z C V. */
7526 #define AARCH64_CC_V 1
7527 #define AARCH64_CC_C (1 << 1)
7528 #define AARCH64_CC_Z (1 << 2)
7529 #define AARCH64_CC_N (1 << 3)
7530
7531 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7532 static const int aarch64_nzcv_codes[] =
7533 {
7534 0, /* EQ, Z == 1. */
7535 AARCH64_CC_Z, /* NE, Z == 0. */
7536 0, /* CS, C == 1. */
7537 AARCH64_CC_C, /* CC, C == 0. */
7538 0, /* MI, N == 1. */
7539 AARCH64_CC_N, /* PL, N == 0. */
7540 0, /* VS, V == 1. */
7541 AARCH64_CC_V, /* VC, V == 0. */
7542 0, /* HI, C == 1 && Z == 0. */
7543 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7544 AARCH64_CC_V, /* GE, N == V. */
7545 0, /* LT, N != V. */
7546 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7547 0, /* LE, !(Z == 0 && N == V). */
7548 0, /* AL, Any. */
7549 0 /* NV, Any. */
7550 };
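/* A small sketch of how an NZCV immediate is composed from the bits
   defined above (the helper name is hypothetical; only the AARCH64_CC_*
   values come from the macros).  */
static unsigned int ATTRIBUTE_UNUSED
example_compose_nzcv (bool n, bool z, bool c, bool v)
{
  return ((n ? AARCH64_CC_N : 0)
          | (z ? AARCH64_CC_Z : 0)
          | (c ? AARCH64_CC_C : 0)
          | (v ? AARCH64_CC_V : 0));
}

/* example_compose_nzcv (false, true, false, false) is 4, matching the GT
   entry in the table above: with Z set, "Z == 0 && N == V" cannot hold.  */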
7551
7552 /* Print floating-point vector immediate operand X to F, negating it
7553 first if NEGATE is true. Return true on success, false if it isn't
7554 a constant we can handle. */
7555
7556 static bool
7557 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7558 {
7559 rtx elt;
7560
7561 if (!const_vec_duplicate_p (x, &elt))
7562 return false;
7563
7564 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7565 if (negate)
7566 r = real_value_negate (&r);
7567
7568 /* We only handle the SVE single-bit immediates here. */
7569 if (real_equal (&r, &dconst0))
7570 asm_fprintf (f, "0.0");
7571 else if (real_equal (&r, &dconst1))
7572 asm_fprintf (f, "1.0");
7573 else if (real_equal (&r, &dconsthalf))
7574 asm_fprintf (f, "0.5");
7575 else
7576 return false;
7577
7578 return true;
7579 }
7580
7581 /* Return the equivalent letter for size. */
7582 static char
7583 sizetochar (int size)
7584 {
7585 switch (size)
7586 {
7587 case 64: return 'd';
7588 case 32: return 's';
7589 case 16: return 'h';
7590 case 8 : return 'b';
7591 default: gcc_unreachable ();
7592 }
7593 }
7594
7595 /* Print operand X to file F in a target specific manner according to CODE.
7596 The acceptable formatting commands given by CODE are:
7597 'c': An integer or symbol address without a preceding #
7598 sign.
7599 'C': Take the duplicated element in a vector constant
7600 and print it in hex.
7601 'D': Take the duplicated element in a vector constant
7602 and print it as an unsigned integer, in decimal.
7603 'e': Print the sign/zero-extend size as a character 8->b,
7604 16->h, 32->w.
7605 'p': Prints N such that 2^N == X (X must be power of 2 and
7606 const int).
7607 'P': Print the number of non-zero bits in X (a const_int).
7608 'H': Print the higher numbered register of a pair (TImode)
7609 of regs.
7610 'm': Print a condition (eq, ne, etc).
7611 'M': Same as 'm', but invert condition.
7612 'N': Take the duplicated element in a vector constant
7613 and print the negative of it in decimal.
7614 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7615 'S/T/U/V': Print a FP/SIMD register name for a register list.
7616 The register printed is the FP/SIMD register name
7617 of X + 0/1/2/3 for S/T/U/V.
7618 'R': Print a scalar FP/SIMD register name + 1.
7619 'X': Print bottom 16 bits of integer constant in hex.
7620 'w/x': Print a general register name or the zero register
7621 (32-bit or 64-bit).
7622 '0': Print a normal operand; if it's a general register,
7623 then we assume DImode.
7624 'k': Print NZCV for conditional compare instructions.
7625 'A': Output address constant representing the first
7626 argument of X, specifying a relocation offset
7627 if appropriate.
7628 'L': Output constant address specified by X
7629 with a relocation offset if appropriate.
7630 'G': Prints address of X, specifying a PC relative
7631 relocation mode if appropriate.
7632 'y': Output address of LDP or STP - this is used for
7633 some LDP/STPs which don't use a PARALLEL in their
7634 pattern (so the mode needs to be adjusted).
7635 'z': Output address of a typical LDP or STP. */
7636
7637 static void
7638 aarch64_print_operand (FILE *f, rtx x, int code)
7639 {
7640 rtx elt;
7641 switch (code)
7642 {
7643 case 'c':
7644 switch (GET_CODE (x))
7645 {
7646 case CONST_INT:
7647 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7648 break;
7649
7650 case SYMBOL_REF:
7651 output_addr_const (f, x);
7652 break;
7653
7654 case CONST:
7655 if (GET_CODE (XEXP (x, 0)) == PLUS
7656 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7657 {
7658 output_addr_const (f, x);
7659 break;
7660 }
7661 /* Fall through. */
7662
7663 default:
7664 output_operand_lossage ("unsupported operand for code '%c'", code);
7665 }
7666 break;
7667
7668 case 'e':
7669 {
7670 int n;
7671
7672 if (!CONST_INT_P (x)
7673 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7674 {
7675 output_operand_lossage ("invalid operand for '%%%c'", code);
7676 return;
7677 }
7678
7679 switch (n)
7680 {
7681 case 3:
7682 fputc ('b', f);
7683 break;
7684 case 4:
7685 fputc ('h', f);
7686 break;
7687 case 5:
7688 fputc ('w', f);
7689 break;
7690 default:
7691 output_operand_lossage ("invalid operand for '%%%c'", code);
7692 return;
7693 }
7694 }
7695 break;
7696
7697 case 'p':
7698 {
7699 int n;
7700
7701 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7702 {
7703 output_operand_lossage ("invalid operand for '%%%c'", code);
7704 return;
7705 }
7706
7707 asm_fprintf (f, "%d", n);
7708 }
7709 break;
7710
7711 case 'P':
7712 if (!CONST_INT_P (x))
7713 {
7714 output_operand_lossage ("invalid operand for '%%%c'", code);
7715 return;
7716 }
7717
7718 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7719 break;
7720
7721 case 'H':
7722 if (x == const0_rtx)
7723 {
7724 asm_fprintf (f, "xzr");
7725 break;
7726 }
7727
7728 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7729 {
7730 output_operand_lossage ("invalid operand for '%%%c'", code);
7731 return;
7732 }
7733
7734 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7735 break;
7736
7737 case 'M':
7738 case 'm':
7739 {
7740 int cond_code;
7741 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7742 if (x == const_true_rtx)
7743 {
7744 if (code == 'M')
7745 fputs ("nv", f);
7746 return;
7747 }
7748
7749 if (!COMPARISON_P (x))
7750 {
7751 output_operand_lossage ("invalid operand for '%%%c'", code);
7752 return;
7753 }
7754
7755 cond_code = aarch64_get_condition_code (x);
7756 gcc_assert (cond_code >= 0);
7757 if (code == 'M')
7758 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7759 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
7760 fputs (aarch64_sve_condition_codes[cond_code], f);
7761 else
7762 fputs (aarch64_condition_codes[cond_code], f);
7763 }
7764 break;
7765
7766 case 'N':
7767 if (!const_vec_duplicate_p (x, &elt))
7768 {
7769 output_operand_lossage ("invalid vector constant");
7770 return;
7771 }
7772
7773 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7774 asm_fprintf (f, "%wd", -INTVAL (elt));
7775 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7776 && aarch64_print_vector_float_operand (f, x, true))
7777 ;
7778 else
7779 {
7780 output_operand_lossage ("invalid vector constant");
7781 return;
7782 }
7783 break;
7784
7785 case 'b':
7786 case 'h':
7787 case 's':
7788 case 'd':
7789 case 'q':
7790 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7791 {
7792 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7793 return;
7794 }
7795 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7796 break;
7797
7798 case 'S':
7799 case 'T':
7800 case 'U':
7801 case 'V':
7802 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7803 {
7804 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7805 return;
7806 }
7807 asm_fprintf (f, "%c%d",
7808 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7809 REGNO (x) - V0_REGNUM + (code - 'S'));
7810 break;
7811
7812 case 'R':
7813 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7814 {
7815 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7816 return;
7817 }
7818 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7819 break;
7820
7821 case 'X':
7822 if (!CONST_INT_P (x))
7823 {
7824 output_operand_lossage ("invalid operand for '%%%c'", code);
7825 return;
7826 }
7827 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7828 break;
7829
7830 case 'C':
7831 {
7832 /* Print a replicated constant in hex. */
7833 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7834 {
7835 output_operand_lossage ("invalid operand for '%%%c'", code);
7836 return;
7837 }
7838 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7839 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7840 }
7841 break;
7842
7843 case 'D':
7844 {
7845 /* Print a replicated constant in decimal, treating it as
7846 unsigned. */
7847 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7848 {
7849 output_operand_lossage ("invalid operand for '%%%c'", code);
7850 return;
7851 }
7852 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7853 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7854 }
7855 break;
7856
7857 case 'w':
7858 case 'x':
7859 if (x == const0_rtx
7860 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7861 {
7862 asm_fprintf (f, "%czr", code);
7863 break;
7864 }
7865
7866 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7867 {
7868 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7869 break;
7870 }
7871
7872 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7873 {
7874 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7875 break;
7876 }
7877
7878 /* Fall through */
7879
7880 case 0:
7881 if (x == NULL)
7882 {
7883 output_operand_lossage ("missing operand");
7884 return;
7885 }
7886
7887 switch (GET_CODE (x))
7888 {
7889 case REG:
7890 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7891 {
7892 if (REG_NREGS (x) == 1)
7893 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7894 else
7895 {
7896 char suffix
7897 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7898 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7899 REGNO (x) - V0_REGNUM, suffix,
7900 END_REGNO (x) - V0_REGNUM - 1, suffix);
7901 }
7902 }
7903 else
7904 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7905 break;
7906
7907 case MEM:
7908 output_address (GET_MODE (x), XEXP (x, 0));
7909 break;
7910
7911 case LABEL_REF:
7912 case SYMBOL_REF:
7913 output_addr_const (asm_out_file, x);
7914 break;
7915
7916 case CONST_INT:
7917 asm_fprintf (f, "%wd", INTVAL (x));
7918 break;
7919
7920 case CONST:
7921 if (!VECTOR_MODE_P (GET_MODE (x)))
7922 {
7923 output_addr_const (asm_out_file, x);
7924 break;
7925 }
7926 /* fall through */
7927
7928 case CONST_VECTOR:
7929 if (!const_vec_duplicate_p (x, &elt))
7930 {
7931 output_operand_lossage ("invalid vector constant");
7932 return;
7933 }
7934
7935 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7936 asm_fprintf (f, "%wd", INTVAL (elt));
7937 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7938 && aarch64_print_vector_float_operand (f, x, false))
7939 ;
7940 else
7941 {
7942 output_operand_lossage ("invalid vector constant");
7943 return;
7944 }
7945 break;
7946
7947 case CONST_DOUBLE:
7948 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7949 be getting CONST_DOUBLEs holding integers. */
7950 gcc_assert (GET_MODE (x) != VOIDmode);
7951 if (aarch64_float_const_zero_rtx_p (x))
7952 {
7953 fputc ('0', f);
7954 break;
7955 }
7956 else if (aarch64_float_const_representable_p (x))
7957 {
7958 #define buf_size 20
7959 char float_buf[buf_size] = {'\0'};
7960 real_to_decimal_for_mode (float_buf,
7961 CONST_DOUBLE_REAL_VALUE (x),
7962 buf_size, buf_size,
7963 1, GET_MODE (x));
7964 asm_fprintf (asm_out_file, "%s", float_buf);
7965 break;
7966 #undef buf_size
7967 }
7968 output_operand_lossage ("invalid constant");
7969 return;
7970 default:
7971 output_operand_lossage ("invalid operand");
7972 return;
7973 }
7974 break;
7975
7976 case 'A':
7977 if (GET_CODE (x) == HIGH)
7978 x = XEXP (x, 0);
7979
7980 switch (aarch64_classify_symbolic_expression (x))
7981 {
7982 case SYMBOL_SMALL_GOT_4G:
7983 asm_fprintf (asm_out_file, ":got:");
7984 break;
7985
7986 case SYMBOL_SMALL_TLSGD:
7987 asm_fprintf (asm_out_file, ":tlsgd:");
7988 break;
7989
7990 case SYMBOL_SMALL_TLSDESC:
7991 asm_fprintf (asm_out_file, ":tlsdesc:");
7992 break;
7993
7994 case SYMBOL_SMALL_TLSIE:
7995 asm_fprintf (asm_out_file, ":gottprel:");
7996 break;
7997
7998 case SYMBOL_TLSLE24:
7999 asm_fprintf (asm_out_file, ":tprel:");
8000 break;
8001
8002 case SYMBOL_TINY_GOT:
8003 gcc_unreachable ();
8004 break;
8005
8006 default:
8007 break;
8008 }
8009 output_addr_const (asm_out_file, x);
8010 break;
8011
8012 case 'L':
8013 switch (aarch64_classify_symbolic_expression (x))
8014 {
8015 case SYMBOL_SMALL_GOT_4G:
8016 asm_fprintf (asm_out_file, ":lo12:");
8017 break;
8018
8019 case SYMBOL_SMALL_TLSGD:
8020 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8021 break;
8022
8023 case SYMBOL_SMALL_TLSDESC:
8024 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8025 break;
8026
8027 case SYMBOL_SMALL_TLSIE:
8028 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8029 break;
8030
8031 case SYMBOL_TLSLE12:
8032 asm_fprintf (asm_out_file, ":tprel_lo12:");
8033 break;
8034
8035 case SYMBOL_TLSLE24:
8036 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8037 break;
8038
8039 case SYMBOL_TINY_GOT:
8040 asm_fprintf (asm_out_file, ":got:");
8041 break;
8042
8043 case SYMBOL_TINY_TLSIE:
8044 asm_fprintf (asm_out_file, ":gottprel:");
8045 break;
8046
8047 default:
8048 break;
8049 }
8050 output_addr_const (asm_out_file, x);
8051 break;
8052
8053 case 'G':
8054 switch (aarch64_classify_symbolic_expression (x))
8055 {
8056 case SYMBOL_TLSLE24:
8057 asm_fprintf (asm_out_file, ":tprel_hi12:");
8058 break;
8059 default:
8060 break;
8061 }
8062 output_addr_const (asm_out_file, x);
8063 break;
8064
8065 case 'k':
8066 {
8067 HOST_WIDE_INT cond_code;
8068
8069 if (!CONST_INT_P (x))
8070 {
8071 output_operand_lossage ("invalid operand for '%%%c'", code);
8072 return;
8073 }
8074
8075 cond_code = INTVAL (x);
8076 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8077 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8078 }
8079 break;
8080
8081 case 'y':
8082 case 'z':
8083 {
8084 machine_mode mode = GET_MODE (x);
8085
8086 if (GET_CODE (x) != MEM
8087 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8088 {
8089 output_operand_lossage ("invalid operand for '%%%c'", code);
8090 return;
8091 }
8092
8093 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8094 code == 'y'
8095 ? ADDR_QUERY_LDP_STP_N
8096 : ADDR_QUERY_LDP_STP))
8097 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8098 }
8099 break;
8100
8101 default:
8102 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8103 return;
8104 }
8105 }
8106
8107 /* Print address 'x' of a memory access with mode 'mode'.
8108 'type' is the query context passed to aarch64_classify_address: it is
8109 ADDR_QUERY_ANY for a normal memory access and an LDP/STP variant otherwise. */
8110 static bool
8111 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8112 aarch64_addr_query_type type)
8113 {
8114 struct aarch64_address_info addr;
8115 unsigned int size;
8116
8117 /* Check all addresses are Pmode - including ILP32. */
8118 if (GET_MODE (x) != Pmode
8119 && (!CONST_INT_P (x)
8120 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8121 {
8122 output_operand_lossage ("invalid address mode");
8123 return false;
8124 }
8125
8126 if (aarch64_classify_address (&addr, x, mode, true, type))
8127 switch (addr.type)
8128 {
8129 case ADDRESS_REG_IMM:
8130 if (known_eq (addr.const_offset, 0))
8131 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8132 else if (aarch64_sve_data_mode_p (mode))
8133 {
8134 HOST_WIDE_INT vnum
8135 = exact_div (addr.const_offset,
8136 BYTES_PER_SVE_VECTOR).to_constant ();
8137 asm_fprintf (f, "[%s, #%wd, mul vl]",
8138 reg_names[REGNO (addr.base)], vnum);
8139 }
8140 else if (aarch64_sve_pred_mode_p (mode))
8141 {
8142 HOST_WIDE_INT vnum
8143 = exact_div (addr.const_offset,
8144 BYTES_PER_SVE_PRED).to_constant ();
8145 asm_fprintf (f, "[%s, #%wd, mul vl]",
8146 reg_names[REGNO (addr.base)], vnum);
8147 }
8148 else
8149 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8150 INTVAL (addr.offset));
8151 return true;
8152
8153 case ADDRESS_REG_REG:
8154 if (addr.shift == 0)
8155 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8156 reg_names [REGNO (addr.offset)]);
8157 else
8158 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8159 reg_names [REGNO (addr.offset)], addr.shift);
8160 return true;
8161
8162 case ADDRESS_REG_UXTW:
8163 if (addr.shift == 0)
8164 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8165 REGNO (addr.offset) - R0_REGNUM);
8166 else
8167 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8168 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8169 return true;
8170
8171 case ADDRESS_REG_SXTW:
8172 if (addr.shift == 0)
8173 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8174 REGNO (addr.offset) - R0_REGNUM);
8175 else
8176 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8177 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8178 return true;
8179
8180 case ADDRESS_REG_WB:
8181 /* Writeback is only supported for fixed-width modes. */
8182 size = GET_MODE_SIZE (mode).to_constant ();
8183 switch (GET_CODE (x))
8184 {
8185 case PRE_INC:
8186 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8187 return true;
8188 case POST_INC:
8189 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8190 return true;
8191 case PRE_DEC:
8192 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8193 return true;
8194 case POST_DEC:
8195 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8196 return true;
8197 case PRE_MODIFY:
8198 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8199 INTVAL (addr.offset));
8200 return true;
8201 case POST_MODIFY:
8202 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8203 INTVAL (addr.offset));
8204 return true;
8205 default:
8206 break;
8207 }
8208 break;
8209
8210 case ADDRESS_LO_SUM:
8211 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8212 output_addr_const (f, addr.offset);
8213 asm_fprintf (f, "]");
8214 return true;
8215
8216 case ADDRESS_SYMBOLIC:
8217 output_addr_const (f, x);
8218 return true;
8219 }
8220
8221 return false;
8222 }
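/* With the formats above, a few representative renderings (the register
   numbers are arbitrary): a plain base prints as "[x0]", a scaled register
   index as "[x0, x1, lsl 3]", an SVE data-mode offset of two vectors as
   "[x0, #2, mul vl]", and a 16-byte pre-increment as "[x0, 16]!".  */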
8223
8224 /* Print address 'x' of a memory access with mode 'mode'. */
8225 static void
8226 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8227 {
8228 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8229 output_addr_const (f, x);
8230 }
8231
8232 bool
8233 aarch64_label_mentioned_p (rtx x)
8234 {
8235 const char *fmt;
8236 int i;
8237
8238 if (GET_CODE (x) == LABEL_REF)
8239 return true;
8240
8241 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8242 referencing instruction, but they are constant offsets, not
8243 symbols. */
8244 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8245 return false;
8246
8247 fmt = GET_RTX_FORMAT (GET_CODE (x));
8248 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8249 {
8250 if (fmt[i] == 'E')
8251 {
8252 int j;
8253
8254 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8255 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8256 return 1;
8257 }
8258 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8259 return 1;
8260 }
8261
8262 return 0;
8263 }
8264
8265 /* Implement REGNO_REG_CLASS. */
8266
8267 enum reg_class
8268 aarch64_regno_regclass (unsigned regno)
8269 {
8270 if (GP_REGNUM_P (regno))
8271 return GENERAL_REGS;
8272
8273 if (regno == SP_REGNUM)
8274 return STACK_REG;
8275
8276 if (regno == FRAME_POINTER_REGNUM
8277 || regno == ARG_POINTER_REGNUM)
8278 return POINTER_REGS;
8279
8280 if (FP_REGNUM_P (regno))
8281 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8282
8283 if (PR_REGNUM_P (regno))
8284 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8285
8286 return NO_REGS;
8287 }
8288
8289 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8290 If OFFSET is out of range, return an offset of an anchor point
8291 that is in range. Return 0 otherwise. */
8292
8293 static HOST_WIDE_INT
8294 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8295 machine_mode mode)
8296 {
8297 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8298 if (size > 16)
8299 return (offset + 0x400) & ~0x7f0;
8300
8301 /* For offsets that aren't a multiple of the access size, the limit is
8302 -256...255. */
8303 if (offset & (size - 1))
8304 {
8305 /* BLKmode typically uses LDP of X-registers. */
8306 if (mode == BLKmode)
8307 return (offset + 512) & ~0x3ff;
8308 return (offset + 0x100) & ~0x1ff;
8309 }
8310
8311 /* Small negative offsets are supported. */
8312 if (IN_RANGE (offset, -256, 0))
8313 return 0;
8314
8315 if (mode == TImode || mode == TFmode)
8316 return (offset + 0x100) & ~0x1ff;
8317
8318 /* Use a 12-bit offset, scaled by the access size. */
8319 return offset & (~0xfff * size);
8320 }
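/* A minimal illustration of the anchoring above (hypothetical helper that
   nothing references): an aligned SImode offset of 0x13000 is outside the
   12-bit scaled range, so the anchor 0x10000 is peeled off and the
   remaining 0x3000 stays within the LDR/STR immediate range, whereas a
   small offset such as 0x100 needs no anchor at all.  */
static void ATTRIBUTE_UNUSED
example_anchor_offsets (void)
{
  gcc_assert (aarch64_anchor_offset (0x13000, 4, SImode) == 0x10000);
  gcc_assert (aarch64_anchor_offset (0x100, 4, SImode) == 0);
}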
8321
8322 static rtx
8323 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8324 {
8325 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8326 where mask is selected by alignment and size of the offset.
8327 We try to pick as large a range for the offset as possible to
8328 maximize the chance of a CSE. However, for aligned addresses
8329 we limit the range to 4k so that structures with different sized
8330 elements are likely to use the same base. We need to be careful
8331 not to split a CONST for some forms of address expression, otherwise
8332 it will generate sub-optimal code. */
8333
8334 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8335 {
8336 rtx base = XEXP (x, 0);
8337 rtx offset_rtx = XEXP (x, 1);
8338 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8339
8340 if (GET_CODE (base) == PLUS)
8341 {
8342 rtx op0 = XEXP (base, 0);
8343 rtx op1 = XEXP (base, 1);
8344
8345 /* Force any scaling into a temp for CSE. */
8346 op0 = force_reg (Pmode, op0);
8347 op1 = force_reg (Pmode, op1);
8348
8349 /* Let the pointer register be in op0. */
8350 if (REG_POINTER (op1))
8351 std::swap (op0, op1);
8352
8353 /* If the pointer is virtual or frame related, then we know that
8354 virtual register instantiation or register elimination is going
8355 to apply a second constant. We want the two constants folded
8356 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8357 if (virt_or_elim_regno_p (REGNO (op0)))
8358 {
8359 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8360 NULL_RTX, true, OPTAB_DIRECT);
8361 return gen_rtx_PLUS (Pmode, base, op1);
8362 }
8363
8364 /* Otherwise, in order to encourage CSE (and thence loop strength
8365 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8366 base = expand_binop (Pmode, add_optab, op0, op1,
8367 NULL_RTX, true, OPTAB_DIRECT);
8368 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8369 }
8370
8371 HOST_WIDE_INT size;
8372 if (GET_MODE_SIZE (mode).is_constant (&size))
8373 {
8374 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8375 mode);
8376 if (base_offset != 0)
8377 {
8378 base = plus_constant (Pmode, base, base_offset);
8379 base = force_operand (base, NULL_RTX);
8380 return plus_constant (Pmode, base, offset - base_offset);
8381 }
8382 }
8383 }
8384
8385 return x;
8386 }
8387
8388 static reg_class_t
8389 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8390 reg_class_t rclass,
8391 machine_mode mode,
8392 secondary_reload_info *sri)
8393 {
8394 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8395 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8396 comment at the head of aarch64-sve.md for more details about the
8397 big-endian handling. */
8398 if (BYTES_BIG_ENDIAN
8399 && reg_class_subset_p (rclass, FP_REGS)
8400 && !((REG_P (x) && HARD_REGISTER_P (x))
8401 || aarch64_simd_valid_immediate (x, NULL))
8402 && aarch64_sve_data_mode_p (mode))
8403 {
8404 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8405 return NO_REGS;
8406 }
8407
8408 /* If we have to disable direct literal pool loads and stores because the
8409 function is too big, then we need a scratch register. */
8410 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8411 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8412 || targetm.vector_mode_supported_p (GET_MODE (x)))
8413 && !aarch64_pcrelative_literal_loads)
8414 {
8415 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8416 return NO_REGS;
8417 }
8418
8419 /* Without the TARGET_SIMD instructions we cannot move a Q register
8420 to a Q register directly. We need a scratch. */
8421 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8422 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8423 && reg_class_subset_p (rclass, FP_REGS))
8424 {
8425 sri->icode = code_for_aarch64_reload_mov (mode);
8426 return NO_REGS;
8427 }
8428
8429 /* A TFmode or TImode memory access should be handled via an FP register
8430 because AArch64 has richer addressing modes for LDR/STR instructions
8431 than for LDP/STP instructions. */
8432 if (TARGET_FLOAT && rclass == GENERAL_REGS
8433 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8434 return FP_REGS;
8435
8436 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8437 return GENERAL_REGS;
8438
8439 return NO_REGS;
8440 }
8441
8442 static bool
8443 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8444 {
8445 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8446
8447 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8448 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8449 if (frame_pointer_needed)
8450 return to == HARD_FRAME_POINTER_REGNUM;
8451 return true;
8452 }
8453
8454 poly_int64
8455 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8456 {
8457 if (to == HARD_FRAME_POINTER_REGNUM)
8458 {
8459 if (from == ARG_POINTER_REGNUM)
8460 return cfun->machine->frame.hard_fp_offset;
8461
8462 if (from == FRAME_POINTER_REGNUM)
8463 return cfun->machine->frame.hard_fp_offset
8464 - cfun->machine->frame.locals_offset;
8465 }
8466
8467 if (to == STACK_POINTER_REGNUM)
8468 {
8469 if (from == FRAME_POINTER_REGNUM)
8470 return cfun->machine->frame.frame_size
8471 - cfun->machine->frame.locals_offset;
8472 }
8473
8474 return cfun->machine->frame.frame_size;
8475 }
8476
8477 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8478 previous frame. */
8479
8480 rtx
8481 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8482 {
8483 if (count != 0)
8484 return const0_rtx;
8485 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8486 }
8487
8488
8489 static void
8490 aarch64_asm_trampoline_template (FILE *f)
8491 {
8492 int offset1 = 16;
8493 int offset2 = 20;
8494
8495 if (aarch64_bti_enabled ())
8496 {
8497 asm_fprintf (f, "\thint\t34 // bti c\n");
8498 offset1 -= 4;
8499 offset2 -= 4;
8500 }
8501
8502 if (TARGET_ILP32)
8503 {
8504 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8505 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8506 offset1);
8507 }
8508 else
8509 {
8510 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8511 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8512 offset2);
8513 }
8514 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8515
8516 /* The trampoline needs an extra padding instruction. If BTI is
8517 enabled, the padding instruction is replaced by the BTI instruction at
8518 the beginning. */
8519 if (!aarch64_bti_enabled ())
8520 assemble_aligned_integer (4, const0_rtx);
8521
8522 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8523 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8524 }
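/* For the LP64, non-BTI case the template above therefore lays out a
   32-byte trampoline along these lines (IP1 is x17 and the static chain
   register is x18 under the usual register assignments):

      0:  ldr     x17, .+16       // load the target address from byte 16
      4:  ldr     x18, .+20       // load the static chain from byte 24
      8:  br      x17
     12:  .word   0               // padding
     16:  .dword  <function address, filled in by the init hook below>
     24:  .dword  <static chain value>

   With BTI enabled, a leading "hint 34" (bti c) replaces the padding word
   and both literal offsets shrink by 4, so the loads still hit the same
   data words.  */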
8525
8526 static void
8527 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8528 {
8529 rtx fnaddr, mem, a_tramp;
8530 const int tramp_code_sz = 16;
8531
8532 /* Don't need to copy the trailing D-words; we fill those in below. */
8533 emit_block_move (m_tramp, assemble_trampoline_template (),
8534 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8535 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8536 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8537 if (GET_MODE (fnaddr) != ptr_mode)
8538 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8539 emit_move_insn (mem, fnaddr);
8540
8541 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8542 emit_move_insn (mem, chain_value);
8543
8544 /* XXX We should really define a "clear_cache" pattern and use
8545 gen_clear_cache(). */
8546 a_tramp = XEXP (m_tramp, 0);
8547 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8548 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8549 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8550 ptr_mode);
8551 }
8552
8553 static unsigned char
8554 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8555 {
8556 /* ??? Logically we should only need to provide a value when
8557 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8558 can hold MODE, but at the moment we need to handle all modes.
8559 Just ignore any runtime parts for registers that can't store them. */
8560 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8561 unsigned int nregs;
8562 switch (regclass)
8563 {
8564 case TAILCALL_ADDR_REGS:
8565 case POINTER_REGS:
8566 case GENERAL_REGS:
8567 case ALL_REGS:
8568 case POINTER_AND_FP_REGS:
8569 case FP_REGS:
8570 case FP_LO_REGS:
8571 if (aarch64_sve_data_mode_p (mode)
8572 && constant_multiple_p (GET_MODE_SIZE (mode),
8573 BYTES_PER_SVE_VECTOR, &nregs))
8574 return nregs;
8575 return (aarch64_vector_data_mode_p (mode)
8576 ? CEIL (lowest_size, UNITS_PER_VREG)
8577 : CEIL (lowest_size, UNITS_PER_WORD));
8578 case STACK_REG:
8579 case PR_REGS:
8580 case PR_LO_REGS:
8581 case PR_HI_REGS:
8582 return 1;
8583
8584 case NO_REGS:
8585 return 0;
8586
8587 default:
8588 break;
8589 }
8590 gcc_unreachable ();
8591 }
8592
8593 static reg_class_t
8594 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8595 {
8596 if (regclass == POINTER_REGS)
8597 return GENERAL_REGS;
8598
8599 if (regclass == STACK_REG)
8600 {
8601 if (REG_P(x)
8602 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8603 return regclass;
8604
8605 return NO_REGS;
8606 }
8607
8608 /* Register elimination can result in a request for
8609 SP+constant->FP_REGS. We cannot support such operations, which
8610 use SP as the source and an FP_REG as the destination, so reject
8611 them outright. */
8612 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8613 {
8614 rtx lhs = XEXP (x, 0);
8615
8616 /* Look through a possible SUBREG introduced by ILP32. */
8617 if (GET_CODE (lhs) == SUBREG)
8618 lhs = SUBREG_REG (lhs);
8619
8620 gcc_assert (REG_P (lhs));
8621 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8622 POINTER_REGS));
8623 return NO_REGS;
8624 }
8625
8626 return regclass;
8627 }
8628
8629 void
8630 aarch64_asm_output_labelref (FILE* f, const char *name)
8631 {
8632 asm_fprintf (f, "%U%s", name);
8633 }
8634
8635 static void
8636 aarch64_elf_asm_constructor (rtx symbol, int priority)
8637 {
8638 if (priority == DEFAULT_INIT_PRIORITY)
8639 default_ctor_section_asm_out_constructor (symbol, priority);
8640 else
8641 {
8642 section *s;
8643 /* Priority is known to be in the range [0, 65535], so 18 bytes
8644 would be enough, but the compiler might not know that. To avoid
8645 a -Wformat-truncation false positive, use a larger size. */
8646 char buf[23];
8647 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8648 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8649 switch_to_section (s);
8650 assemble_align (POINTER_SIZE);
8651 assemble_aligned_integer (POINTER_BYTES, symbol);
8652 }
8653 }
8654
8655 static void
8656 aarch64_elf_asm_destructor (rtx symbol, int priority)
8657 {
8658 if (priority == DEFAULT_INIT_PRIORITY)
8659 default_dtor_section_asm_out_destructor (symbol, priority);
8660 else
8661 {
8662 section *s;
8663 /* Priority is known to be in the range [0, 65535], so 18 bytes
8664 would be enough, but the compiler might not know that. To avoid
8665 a -Wformat-truncation false positive, use a larger size. */
8666 char buf[23];
8667 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8668 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8669 switch_to_section (s);
8670 assemble_align (POINTER_SIZE);
8671 assemble_aligned_integer (POINTER_BYTES, symbol);
8672 }
8673 }
8674
8675 const char*
8676 aarch64_output_casesi (rtx *operands)
8677 {
8678 char buf[100];
8679 char label[100];
8680 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8681 int index;
8682 static const char *const patterns[4][2] =
8683 {
8684 {
8685 "ldrb\t%w3, [%0,%w1,uxtw]",
8686 "add\t%3, %4, %w3, sxtb #2"
8687 },
8688 {
8689 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8690 "add\t%3, %4, %w3, sxth #2"
8691 },
8692 {
8693 "ldr\t%w3, [%0,%w1,uxtw #2]",
8694 "add\t%3, %4, %w3, sxtw #2"
8695 },
8696 /* We assume that DImode is only generated when not optimizing and
8697 that we don't really need 64-bit address offsets. That would
8698 imply an object file with 8GB of code in a single function! */
8699 {
8700 "ldr\t%w3, [%0,%w1,uxtw #2]",
8701 "add\t%3, %4, %w3, sxtw #2"
8702 }
8703 };
8704
8705 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8706
8707 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8708 index = exact_log2 (GET_MODE_SIZE (mode));
8709
8710 gcc_assert (index >= 0 && index <= 3);
8711
8712 /* Need to implement table size reduction, by changing the code below. */
8713 output_asm_insn (patterns[index][0], operands);
8714 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8715 snprintf (buf, sizeof (buf),
8716 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8717 output_asm_insn (buf, operands);
8718 output_asm_insn (patterns[index][1], operands);
8719 output_asm_insn ("br\t%3", operands);
8720 assemble_label (asm_out_file, label);
8721 return "";
8722 }
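/* For an HImode dispatch table, for example, the sequence printed above
   comes out along these lines (the register numbers and label number
   depend on the operands):

      ldrh    w3, [x0,w1,uxtw #1]
      adr     x4, .Lrtx<N>
      add     x3, x4, w3, sxth #2
      br      x3
   .Lrtx<N>:

   i.e. a halfword table entry is sign-extended, scaled by 4 and added to
   the address of the label emitted just after the branch.  */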
8723
8724
8725 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8726 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8727 operator. */
8728
8729 int
8730 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8731 {
8732 if (shift >= 0 && shift <= 3)
8733 {
8734 int size;
8735 for (size = 8; size <= 32; size *= 2)
8736 {
8737 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8738 if (mask == bits << shift)
8739 return size;
8740 }
8741 }
8742 return 0;
8743 }
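/* A small sketch of the masks accepted above (hypothetical helper that
   nothing references): a contiguous 8-, 16- or 32-bit mask starting at
   bit SHIFT is recognised, anything else is rejected.  */
static void ATTRIBUTE_UNUSED
example_uxt_size_checks (void)
{
  gcc_assert (aarch64_uxt_size (2, (HOST_WIDE_INT) 0xff << 2) == 8);
  gcc_assert (aarch64_uxt_size (3, (HOST_WIDE_INT) 0xffff << 3) == 16);
  gcc_assert (aarch64_uxt_size (1, (HOST_WIDE_INT) 0xff << 2) == 0);
}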
8744
8745 /* Constant pools are per-function only when PC-relative
8746 literal loads are enabled or we are using the large memory
8747 model. */
8748
8749 static inline bool
8750 aarch64_can_use_per_function_literal_pools_p (void)
8751 {
8752 return (aarch64_pcrelative_literal_loads
8753 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8754 }
8755
8756 static bool
8757 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8758 {
8759 /* We can't use blocks for constants when we're using a per-function
8760 constant pool. */
8761 return !aarch64_can_use_per_function_literal_pools_p ();
8762 }
8763
8764 /* Select appropriate section for constants depending
8765 on where we place literal pools. */
8766
8767 static section *
8768 aarch64_select_rtx_section (machine_mode mode,
8769 rtx x,
8770 unsigned HOST_WIDE_INT align)
8771 {
8772 if (aarch64_can_use_per_function_literal_pools_p ())
8773 return function_section (current_function_decl);
8774
8775 return default_elf_select_rtx_section (mode, x, align);
8776 }
8777
8778 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8779 void
8780 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8781 HOST_WIDE_INT offset)
8782 {
8783 /* When using per-function literal pools, we must ensure that any code
8784 section is aligned to the minimal instruction length, lest we get
8785 errors from the assembler re "unaligned instructions". */
8786 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8787 ASM_OUTPUT_ALIGN (f, 2);
8788 }
8789
8790 /* Costs. */
8791
8792 /* Helper function for rtx cost calculation. Strip a shift expression
8793 from X. Returns the inner operand if successful, or the original
8794 expression on failure. */
8795 static rtx
8796 aarch64_strip_shift (rtx x)
8797 {
8798 rtx op = x;
8799
8800 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8801 we can convert both to ROR during final output. */
8802 if ((GET_CODE (op) == ASHIFT
8803 || GET_CODE (op) == ASHIFTRT
8804 || GET_CODE (op) == LSHIFTRT
8805 || GET_CODE (op) == ROTATERT
8806 || GET_CODE (op) == ROTATE)
8807 && CONST_INT_P (XEXP (op, 1)))
8808 return XEXP (op, 0);
8809
8810 if (GET_CODE (op) == MULT
8811 && CONST_INT_P (XEXP (op, 1))
8812 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8813 return XEXP (op, 0);
8814
8815 return x;
8816 }
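/* For example, both (ashift:DI (reg:DI x1) (const_int 3)) and the
   equivalent (mult:DI (reg:DI x1) (const_int 8)) strip down to
   (reg:DI x1), since 8 is an exact power of two; a MULT by a constant
   that is not a power of two is returned unchanged.  */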
8817
8818 /* Helper function for rtx cost calculation. Strip an extend
8819 expression from X. Returns the inner operand if successful, or the
8820 original expression on failure. We deal with a number of possible
8821 canonicalization variations here. If STRIP_SHIFT is true, then
8822 we can strip off a shift also. */
8823 static rtx
8824 aarch64_strip_extend (rtx x, bool strip_shift)
8825 {
8826 scalar_int_mode mode;
8827 rtx op = x;
8828
8829 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8830 return op;
8831
8832 /* Zero and sign extraction of a widened value. */
8833 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8834 && XEXP (op, 2) == const0_rtx
8835 && GET_CODE (XEXP (op, 0)) == MULT
8836 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8837 XEXP (op, 1)))
8838 return XEXP (XEXP (op, 0), 0);
8839
8840 /* It can also be represented (for zero-extend) as an AND with an
8841 immediate. */
8842 if (GET_CODE (op) == AND
8843 && GET_CODE (XEXP (op, 0)) == MULT
8844 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8845 && CONST_INT_P (XEXP (op, 1))
8846 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8847 INTVAL (XEXP (op, 1))) != 0)
8848 return XEXP (XEXP (op, 0), 0);
8849
8850 /* Now handle extended register, as this may also have an optional
8851 left shift by 1..4. */
8852 if (strip_shift
8853 && GET_CODE (op) == ASHIFT
8854 && CONST_INT_P (XEXP (op, 1))
8855 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8856 op = XEXP (op, 0);
8857
8858 if (GET_CODE (op) == ZERO_EXTEND
8859 || GET_CODE (op) == SIGN_EXTEND)
8860 op = XEXP (op, 0);
8861
8862 if (op != x)
8863 return op;
8864
8865 return x;
8866 }
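/* For example, (zero_extend:DI (reg:SI w1)) strips down to (reg:SI w1),
   and with STRIP_SHIFT true so does
   (ashift:DI (sign_extend:DI (reg:SI w1)) (const_int 2)), matching the
   extended-register forms with their optional left shift of up to 4.  */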
8867
8868 /* Return true iff CODE is a shift supported in combination
8869 with arithmetic instructions. */
8870
8871 static bool
8872 aarch64_shift_p (enum rtx_code code)
8873 {
8874 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8875 }
8876
8877
8878 /* Return true iff X is a cheap shift without a sign extend. */
8879
8880 static bool
8881 aarch64_cheap_mult_shift_p (rtx x)
8882 {
8883 rtx op0, op1;
8884
8885 op0 = XEXP (x, 0);
8886 op1 = XEXP (x, 1);
8887
8888 if (!(aarch64_tune_params.extra_tuning_flags
8889 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8890 return false;
8891
8892 if (GET_CODE (op0) == SIGN_EXTEND)
8893 return false;
8894
8895 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8896 && UINTVAL (op1) <= 4)
8897 return true;
8898
8899 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8900 return false;
8901
8902 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8903
8904 if (l2 > 0 && l2 <= 4)
8905 return true;
8906
8907 return false;
8908 }
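/* For instance, with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND enabled,
   (mult:DI (reg:DI x1) (const_int 8)) is considered cheap (log2 is 3,
   within the 1..4 range), whereas a multiply by 32 (log2 of 5) or any
   form whose first operand is a SIGN_EXTEND is not.  */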
8909
8910 /* Helper function for rtx cost calculation. Calculate the cost of
8911 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8912 Return the calculated cost of the expression, recursing manually into
8913 operands where needed. */
8914
8915 static int
8916 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8917 {
8918 rtx op0, op1;
8919 const struct cpu_cost_table *extra_cost
8920 = aarch64_tune_params.insn_extra_cost;
8921 int cost = 0;
8922 bool compound_p = (outer == PLUS || outer == MINUS);
8923 machine_mode mode = GET_MODE (x);
8924
8925 gcc_checking_assert (code == MULT);
8926
8927 op0 = XEXP (x, 0);
8928 op1 = XEXP (x, 1);
8929
8930 if (VECTOR_MODE_P (mode))
8931 mode = GET_MODE_INNER (mode);
8932
8933 /* Integer multiply/fma. */
8934 if (GET_MODE_CLASS (mode) == MODE_INT)
8935 {
8936 /* The multiply will be canonicalized as a shift, cost it as such. */
8937 if (aarch64_shift_p (GET_CODE (x))
8938 || (CONST_INT_P (op1)
8939 && exact_log2 (INTVAL (op1)) > 0))
8940 {
8941 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8942 || GET_CODE (op0) == SIGN_EXTEND;
8943 if (speed)
8944 {
8945 if (compound_p)
8946 {
8947 /* If the shift is considered cheap,
8948 then don't add any cost. */
8949 if (aarch64_cheap_mult_shift_p (x))
8950 ;
8951 else if (REG_P (op1))
8952 /* ARITH + shift-by-register. */
8953 cost += extra_cost->alu.arith_shift_reg;
8954 else if (is_extend)
8955 /* ARITH + extended register. We don't have a cost field
8956 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8957 cost += extra_cost->alu.extend_arith;
8958 else
8959 /* ARITH + shift-by-immediate. */
8960 cost += extra_cost->alu.arith_shift;
8961 }
8962 else
8963 /* LSL (immediate). */
8964 cost += extra_cost->alu.shift;
8965
8966 }
8967 /* Strip extends as we will have costed them in the case above. */
8968 if (is_extend)
8969 op0 = aarch64_strip_extend (op0, true);
8970
8971 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8972
8973 return cost;
8974 }
8975
8976 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8977 compound and let the below cases handle it. After all, MNEG is a
8978 special-case alias of MSUB. */
8979 if (GET_CODE (op0) == NEG)
8980 {
8981 op0 = XEXP (op0, 0);
8982 compound_p = true;
8983 }
8984
8985 /* Integer multiplies or FMAs have zero/sign extending variants. */
8986 if ((GET_CODE (op0) == ZERO_EXTEND
8987 && GET_CODE (op1) == ZERO_EXTEND)
8988 || (GET_CODE (op0) == SIGN_EXTEND
8989 && GET_CODE (op1) == SIGN_EXTEND))
8990 {
8991 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8992 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8993
8994 if (speed)
8995 {
8996 if (compound_p)
8997 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8998 cost += extra_cost->mult[0].extend_add;
8999 else
9000 /* MUL/SMULL/UMULL. */
9001 cost += extra_cost->mult[0].extend;
9002 }
9003
9004 return cost;
9005 }
9006
9007 /* This is either an integer multiply or a MADD. In both cases
9008 we want to recurse and cost the operands. */
9009 cost += rtx_cost (op0, mode, MULT, 0, speed);
9010 cost += rtx_cost (op1, mode, MULT, 1, speed);
9011
9012 if (speed)
9013 {
9014 if (compound_p)
9015 /* MADD/MSUB. */
9016 cost += extra_cost->mult[mode == DImode].add;
9017 else
9018 /* MUL. */
9019 cost += extra_cost->mult[mode == DImode].simple;
9020 }
9021
9022 return cost;
9023 }
9024 else
9025 {
9026 if (speed)
9027 {
9028 /* Floating-point FMA/FMUL can also support negations of the
9029 operands, unless the rounding mode is upward or downward in
9030 which case FNMUL is different from FMUL with operand negation. */
9031 bool neg0 = GET_CODE (op0) == NEG;
9032 bool neg1 = GET_CODE (op1) == NEG;
9033 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9034 {
9035 if (neg0)
9036 op0 = XEXP (op0, 0);
9037 if (neg1)
9038 op1 = XEXP (op1, 0);
9039 }
9040
9041 if (compound_p)
9042 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9043 cost += extra_cost->fp[mode == DFmode].fma;
9044 else
9045 /* FMUL/FNMUL. */
9046 cost += extra_cost->fp[mode == DFmode].mult;
9047 }
9048
9049 cost += rtx_cost (op0, mode, MULT, 0, speed);
9050 cost += rtx_cost (op1, mode, MULT, 1, speed);
9051 return cost;
9052 }
9053 }
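/* As a rough example: when the PLUS case of aarch64_rtx_costs passes
   (mult:SI (reg) (const_int 4)) here with OUTER == PLUS, the multiply is
   treated as LSL #2 and the combination is costed as an ADD (shifted
   register), i.e. extra_cost->alu.arith_shift, unless the tuning marks
   such shifts as cheap.  */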
9054
9055 static int
9056 aarch64_address_cost (rtx x,
9057 machine_mode mode,
9058 addr_space_t as ATTRIBUTE_UNUSED,
9059 bool speed)
9060 {
9061 enum rtx_code c = GET_CODE (x);
9062 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9063 struct aarch64_address_info info;
9064 int cost = 0;
9065 info.shift = 0;
9066
9067 if (!aarch64_classify_address (&info, x, mode, false))
9068 {
9069 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9070 {
9071 /* This is a CONST or SYMBOL ref which will be split
9072 in a different way depending on the code model in use.
9073 Cost it through the generic infrastructure. */
9074 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9075 /* Divide through by the cost of one instruction to
9076 bring it to the same units as the address costs. */
9077 cost_symbol_ref /= COSTS_N_INSNS (1);
9078 /* The cost is then the cost of preparing the address,
9079 followed by an immediate (possibly 0) offset. */
9080 return cost_symbol_ref + addr_cost->imm_offset;
9081 }
9082 else
9083 {
9084 /* This is most likely a jump table from a case
9085 statement. */
9086 return addr_cost->register_offset;
9087 }
9088 }
9089
9090 switch (info.type)
9091 {
9092 case ADDRESS_LO_SUM:
9093 case ADDRESS_SYMBOLIC:
9094 case ADDRESS_REG_IMM:
9095 cost += addr_cost->imm_offset;
9096 break;
9097
9098 case ADDRESS_REG_WB:
9099 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9100 cost += addr_cost->pre_modify;
9101 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9102 cost += addr_cost->post_modify;
9103 else
9104 gcc_unreachable ();
9105
9106 break;
9107
9108 case ADDRESS_REG_REG:
9109 cost += addr_cost->register_offset;
9110 break;
9111
9112 case ADDRESS_REG_SXTW:
9113 cost += addr_cost->register_sextend;
9114 break;
9115
9116 case ADDRESS_REG_UXTW:
9117 cost += addr_cost->register_zextend;
9118 break;
9119
9120 default:
9121 gcc_unreachable ();
9122 }
9123
9124
9125 if (info.shift > 0)
9126 {
9127 /* For the sake of calculating the cost of the shifted register
9128 component, we can treat same sized modes in the same way. */
9129 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9130 cost += addr_cost->addr_scale_costs.hi;
9131 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9132 cost += addr_cost->addr_scale_costs.si;
9133 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9134 cost += addr_cost->addr_scale_costs.di;
9135 else
9136 /* We can't tell, or this is a 128-bit vector. */
9137 cost += addr_cost->addr_scale_costs.ti;
9138 }
9139
9140 return cost;
9141 }
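/* For instance, an SImode access through [x0, w1, sxtw #2] is expected
   to classify as ADDRESS_REG_SXTW with a shift of 2, giving roughly
   register_sextend plus addr_scale_costs.si, whereas the plain
   register-offset form [x0, x1] costs register_offset with no scaling
   component.  */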
9142
9143 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9144 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9145 to be taken. */
9146
9147 int
9148 aarch64_branch_cost (bool speed_p, bool predictable_p)
9149 {
9150 /* When optimizing for speed, use the cost of unpredictable branches. */
9151 const struct cpu_branch_cost *branch_costs =
9152 aarch64_tune_params.branch_costs;
9153
9154 if (!speed_p || predictable_p)
9155 return branch_costs->predictable;
9156 else
9157 return branch_costs->unpredictable;
9158 }
9159
9160 /* Return true if the RTX X in mode MODE is a zero or sign extract
9161 usable in an ADD or SUB (extended register) instruction. */
9162 static bool
9163 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9164 {
9165 /* Catch add with a sign extract.
9166 This is add_<optab><mode>_multp2. */
9167 if (GET_CODE (x) == SIGN_EXTRACT
9168 || GET_CODE (x) == ZERO_EXTRACT)
9169 {
9170 rtx op0 = XEXP (x, 0);
9171 rtx op1 = XEXP (x, 1);
9172 rtx op2 = XEXP (x, 2);
9173
9174 if (GET_CODE (op0) == MULT
9175 && CONST_INT_P (op1)
9176 && op2 == const0_rtx
9177 && CONST_INT_P (XEXP (op0, 1))
9178 && aarch64_is_extend_from_extract (mode,
9179 XEXP (op0, 1),
9180 op1))
9181 {
9182 return true;
9183 }
9184 }
9185 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9186 No shift. */
9187 else if (GET_CODE (x) == SIGN_EXTEND
9188 || GET_CODE (x) == ZERO_EXTEND)
9189 return REG_P (XEXP (x, 0));
9190
9191 return false;
9192 }
9193
9194 static bool
9195 aarch64_frint_unspec_p (unsigned int u)
9196 {
9197 switch (u)
9198 {
9199 case UNSPEC_FRINTZ:
9200 case UNSPEC_FRINTP:
9201 case UNSPEC_FRINTM:
9202 case UNSPEC_FRINTA:
9203 case UNSPEC_FRINTN:
9204 case UNSPEC_FRINTX:
9205 case UNSPEC_FRINTI:
9206 return true;
9207
9208 default:
9209 return false;
9210 }
9211 }
9212
9213 /* Return true iff X is an rtx that will match an extr instruction
9214 i.e. as described in the *extr<mode>5_insn family of patterns.
9215 OP0 and OP1 will be set to the operands of the shifts involved
9216 on success and will be NULL_RTX otherwise. */
9217
9218 static bool
9219 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9220 {
9221 rtx op0, op1;
9222 scalar_int_mode mode;
9223 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9224 return false;
9225
9226 *res_op0 = NULL_RTX;
9227 *res_op1 = NULL_RTX;
9228
9229 if (GET_CODE (x) != IOR)
9230 return false;
9231
9232 op0 = XEXP (x, 0);
9233 op1 = XEXP (x, 1);
9234
9235 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9236 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9237 {
9238 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9239 if (GET_CODE (op1) == ASHIFT)
9240 std::swap (op0, op1);
9241
9242 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9243 return false;
9244
9245 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9246 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9247
9248 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9249 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9250 {
9251 *res_op0 = XEXP (op0, 0);
9252 *res_op1 = XEXP (op1, 0);
9253 return true;
9254 }
9255 }
9256
9257 return false;
9258 }
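/* For example, (ior:DI (ashift:DI (reg x1) (const_int 48))
   (lshiftrt:DI (reg x2) (const_int 16))) matches, since the two shift
   amounts sum to 64: *RES_OP0 is set to x1, *RES_OP1 to x2, and the
   expression corresponds to an EXTR with an immediate of 16.  */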
9259
9260 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9261 storing it in *COST. Result is true if the total cost of the operation
9262 has now been calculated. */
9263 static bool
9264 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9265 {
9266 rtx inner;
9267 rtx comparator;
9268 enum rtx_code cmpcode;
9269
9270 if (COMPARISON_P (op0))
9271 {
9272 inner = XEXP (op0, 0);
9273 comparator = XEXP (op0, 1);
9274 cmpcode = GET_CODE (op0);
9275 }
9276 else
9277 {
9278 inner = op0;
9279 comparator = const0_rtx;
9280 cmpcode = NE;
9281 }
9282
9283 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9284 {
9285 /* Conditional branch. */
9286 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9287 return true;
9288 else
9289 {
9290 if (cmpcode == NE || cmpcode == EQ)
9291 {
9292 if (comparator == const0_rtx)
9293 {
9294 /* TBZ/TBNZ/CBZ/CBNZ. */
9295 if (GET_CODE (inner) == ZERO_EXTRACT)
9296 /* TBZ/TBNZ. */
9297 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9298 ZERO_EXTRACT, 0, speed);
9299 else
9300 /* CBZ/CBNZ. */
9301 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9302
9303 return true;
9304 }
9305 }
9306 else if (cmpcode == LT || cmpcode == GE)
9307 {
9308 /* TBZ/TBNZ. */
9309 if (comparator == const0_rtx)
9310 return true;
9311 }
9312 }
9313 }
9314 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9315 {
9316 /* CCMP. */
9317 if (GET_CODE (op1) == COMPARE)
9318 {
9319 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9320 if (XEXP (op1, 1) == const0_rtx)
9321 *cost += 1;
9322 if (speed)
9323 {
9324 machine_mode mode = GET_MODE (XEXP (op1, 0));
9325 const struct cpu_cost_table *extra_cost
9326 = aarch64_tune_params.insn_extra_cost;
9327
9328 if (GET_MODE_CLASS (mode) == MODE_INT)
9329 *cost += extra_cost->alu.arith;
9330 else
9331 *cost += extra_cost->fp[mode == DFmode].compare;
9332 }
9333 return true;
9334 }
9335
9336 /* It's a conditional operation based on the status flags,
9337 so it must be some flavor of CSEL. */
9338
9339 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9340 if (GET_CODE (op1) == NEG
9341 || GET_CODE (op1) == NOT
9342 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9343 op1 = XEXP (op1, 0);
9344 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9345 {
9346 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9347 op1 = XEXP (op1, 0);
9348 op2 = XEXP (op2, 0);
9349 }
9350
9351 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9352 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9353 return true;
9354 }
9355
9356 /* We don't know what this is; cost all operands. */
9357 return false;
9358 }
9359
9360 /* Check whether X is a bitfield operation of the form shift + extend that
9361 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9362 operand to which the bitfield operation is applied. Otherwise return
9363 NULL_RTX. */
9364
9365 static rtx
9366 aarch64_extend_bitfield_pattern_p (rtx x)
9367 {
9368 rtx_code outer_code = GET_CODE (x);
9369 machine_mode outer_mode = GET_MODE (x);
9370
9371 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9372 && outer_mode != SImode && outer_mode != DImode)
9373 return NULL_RTX;
9374
9375 rtx inner = XEXP (x, 0);
9376 rtx_code inner_code = GET_CODE (inner);
9377 machine_mode inner_mode = GET_MODE (inner);
9378 rtx op = NULL_RTX;
9379
9380 switch (inner_code)
9381 {
9382 case ASHIFT:
9383 if (CONST_INT_P (XEXP (inner, 1))
9384 && (inner_mode == QImode || inner_mode == HImode))
9385 op = XEXP (inner, 0);
9386 break;
9387 case LSHIFTRT:
9388 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9389 && (inner_mode == QImode || inner_mode == HImode))
9390 op = XEXP (inner, 0);
9391 break;
9392 case ASHIFTRT:
9393 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9394 && (inner_mode == QImode || inner_mode == HImode))
9395 op = XEXP (inner, 0);
9396 break;
9397 default:
9398 break;
9399 }
9400
9401 return op;
9402 }
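/* For example, (zero_extend:SI (ashift:HI (reg:HI x) (const_int 3)))
   returns (reg:HI x): the shift-then-extend maps onto a UBFIZ.
   Similarly, LSHIFTRT under a ZERO_EXTEND maps onto a UBFX, and the
   SIGN_EXTEND forms onto the signed SBFIZ/SBFX counterparts.  */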
9403
9404 /* Return true if the mask and a shift amount from an RTX of the form
9405 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9406 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9407
9408 bool
9409 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9410 rtx shft_amnt)
9411 {
9412 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9413 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9414 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9415 && (INTVAL (mask)
9416 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9417 }
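/* Worked example for SImode: MASK = 0xff0 with SHFT_AMNT = 4 passes all
   three checks (4 < 32; (0xff0 >> 4) + 1 == 0x100 is a power of two;
   the low four bits of the mask are clear), matching a UBFIZ with lsb 4
   and width 8.  A mask of 0xff4 would fail the final check.  */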
9418
9419 /* Return true if the masks and a shift amount from an RTX of the form
9420 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9421 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9422
9423 bool
9424 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9425 unsigned HOST_WIDE_INT mask1,
9426 unsigned HOST_WIDE_INT shft_amnt,
9427 unsigned HOST_WIDE_INT mask2)
9428 {
9429 unsigned HOST_WIDE_INT t;
9430
9431 /* Verify that there is no overlap in what bits are set in the two masks. */
9432 if (mask1 != ~mask2)
9433 return false;
9434
9435 /* Verify that mask2 is not all zeros or ones. */
9436 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9437 return false;
9438
9439 /* The shift amount should always be less than the mode size. */
9440 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9441
9442 /* Verify that the mask being shifted is contiguous and would be in the
9443 least significant bits after shifting by shft_amnt. */
9444 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9445 return (t == (t & -t));
9446 }
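/* Worked example of the contiguity check: with MASK2 = 0xff00,
   SHFT_AMNT = 8 and MASK1 the complement of MASK2, T becomes
   0xff00 + 0x100 = 0x10000, a power of two, so the test passes.  A
   non-contiguous MASK2 such as 0xf0f0 would leave extra bits set in T
   and fail.  */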
9447
9448 /* Calculate the cost of calculating X, storing it in *COST. Result
9449 is true if the total cost of the operation has now been calculated. */
9450 static bool
9451 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9452 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9453 {
9454 rtx op0, op1, op2;
9455 const struct cpu_cost_table *extra_cost
9456 = aarch64_tune_params.insn_extra_cost;
9457 int code = GET_CODE (x);
9458 scalar_int_mode int_mode;
9459
9460 /* By default, assume that everything has equivalent cost to the
9461 cheapest instruction. Any additional costs are applied as a delta
9462 above this default. */
9463 *cost = COSTS_N_INSNS (1);
9464
9465 switch (code)
9466 {
9467 case SET:
9468 /* The cost depends entirely on the operands to SET. */
9469 *cost = 0;
9470 op0 = SET_DEST (x);
9471 op1 = SET_SRC (x);
9472
9473 switch (GET_CODE (op0))
9474 {
9475 case MEM:
9476 if (speed)
9477 {
9478 rtx address = XEXP (op0, 0);
9479 if (VECTOR_MODE_P (mode))
9480 *cost += extra_cost->ldst.storev;
9481 else if (GET_MODE_CLASS (mode) == MODE_INT)
9482 *cost += extra_cost->ldst.store;
9483 else if (mode == SFmode)
9484 *cost += extra_cost->ldst.storef;
9485 else if (mode == DFmode)
9486 *cost += extra_cost->ldst.stored;
9487
9488 *cost +=
9489 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9490 0, speed));
9491 }
9492
9493 *cost += rtx_cost (op1, mode, SET, 1, speed);
9494 return true;
9495
9496 case SUBREG:
9497 if (! REG_P (SUBREG_REG (op0)))
9498 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9499
9500 /* Fall through. */
9501 case REG:
9502 /* The cost is one per vector-register copied. */
9503 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9504 {
9505 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9506 *cost = COSTS_N_INSNS (nregs);
9507 }
9508 /* const0_rtx is in general free, but we will use an
9509 instruction to set a register to 0. */
9510 else if (REG_P (op1) || op1 == const0_rtx)
9511 {
9512 /* The cost is 1 per register copied. */
9513 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9514 *cost = COSTS_N_INSNS (nregs);
9515 }
9516 else
9517 /* Cost is just the cost of the RHS of the set. */
9518 *cost += rtx_cost (op1, mode, SET, 1, speed);
9519 return true;
9520
9521 case ZERO_EXTRACT:
9522 case SIGN_EXTRACT:
9523 /* Bit-field insertion. Strip any redundant widening of
9524 the RHS to meet the width of the target. */
9525 if (GET_CODE (op1) == SUBREG)
9526 op1 = SUBREG_REG (op1);
9527 if ((GET_CODE (op1) == ZERO_EXTEND
9528 || GET_CODE (op1) == SIGN_EXTEND)
9529 && CONST_INT_P (XEXP (op0, 1))
9530 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9531 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9532 op1 = XEXP (op1, 0);
9533
9534 if (CONST_INT_P (op1))
9535 {
9536 /* MOV immediate is assumed to always be cheap. */
9537 *cost = COSTS_N_INSNS (1);
9538 }
9539 else
9540 {
9541 /* BFM. */
9542 if (speed)
9543 *cost += extra_cost->alu.bfi;
9544 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9545 }
9546
9547 return true;
9548
9549 default:
9550 /* We can't make sense of this; assume default cost. */
9551 *cost = COSTS_N_INSNS (1);
9552 return false;
9553 }
9554 return false;
9555
9556 case CONST_INT:
9557 /* If an instruction can incorporate a constant within the
9558 instruction, the instruction's expression avoids calling
9559 rtx_cost() on the constant. If rtx_cost() is called on a
9560 constant, then it is usually because the constant must be
9561 moved into a register by one or more instructions.
9562
9563 The exception is constant 0, which can be expressed
9564 as XZR/WZR and is therefore free. The exception to this is
9565 if we have (set (reg) (const0_rtx)) in which case we must cost
9566 the move. However, we can catch that when we cost the SET, so
9567 we don't need to consider that here. */
9568 if (x == const0_rtx)
9569 *cost = 0;
9570 else
9571 {
9572 /* To an approximation, the cost of building any other constant
9573 is proportional to the number of instructions required to
9574 build that constant. This is true whether we
9575 are compiling for SPEED or otherwise. */
9576 if (!is_a <scalar_int_mode> (mode, &int_mode))
9577 int_mode = word_mode;
9578 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9579 (NULL_RTX, x, false, int_mode));
9580 }
9581 return true;
9582
9583 case CONST_DOUBLE:
9584
9585 /* First determine number of instructions to do the move
9586 as an integer constant. */
9587 if (!aarch64_float_const_representable_p (x)
9588 && !aarch64_can_const_movi_rtx_p (x, mode)
9589 && aarch64_float_const_rtx_p (x))
9590 {
9591 unsigned HOST_WIDE_INT ival;
9592 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9593 gcc_assert (succeed);
9594
9595 scalar_int_mode imode = (mode == HFmode
9596 ? SImode
9597 : int_mode_for_mode (mode).require ());
9598 int ncost = aarch64_internal_mov_immediate
9599 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9600 *cost += COSTS_N_INSNS (ncost);
9601 return true;
9602 }
9603
9604 if (speed)
9605 {
9606 /* mov[df,sf]_aarch64. */
9607 if (aarch64_float_const_representable_p (x))
9608 /* FMOV (scalar immediate). */
9609 *cost += extra_cost->fp[mode == DFmode].fpconst;
9610 else if (!aarch64_float_const_zero_rtx_p (x))
9611 {
9612 /* This will be a load from memory. */
9613 if (mode == DFmode)
9614 *cost += extra_cost->ldst.loadd;
9615 else
9616 *cost += extra_cost->ldst.loadf;
9617 }
9618 else
9619 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9620 or MOV v0.s[0], wzr - neither of which are modeled by the
9621 cost tables. Just use the default cost. */
9622 {
9623 }
9624 }
9625
9626 return true;
9627
9628 case MEM:
9629 if (speed)
9630 {
9631 /* For loads we want the base cost of a load, plus an
9632 approximation for the additional cost of the addressing
9633 mode. */
9634 rtx address = XEXP (x, 0);
9635 if (VECTOR_MODE_P (mode))
9636 *cost += extra_cost->ldst.loadv;
9637 else if (GET_MODE_CLASS (mode) == MODE_INT)
9638 *cost += extra_cost->ldst.load;
9639 else if (mode == SFmode)
9640 *cost += extra_cost->ldst.loadf;
9641 else if (mode == DFmode)
9642 *cost += extra_cost->ldst.loadd;
9643
9644 *cost +=
9645 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9646 0, speed));
9647 }
9648
9649 return true;
9650
9651 case NEG:
9652 op0 = XEXP (x, 0);
9653
9654 if (VECTOR_MODE_P (mode))
9655 {
9656 if (speed)
9657 {
9658 /* FNEG. */
9659 *cost += extra_cost->vect.alu;
9660 }
9661 return false;
9662 }
9663
9664 if (GET_MODE_CLASS (mode) == MODE_INT)
9665 {
9666 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9667 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9668 {
9669 /* CSETM. */
9670 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9671 return true;
9672 }
9673
9674 /* Cost this as SUB wzr, X. */
9675 op0 = CONST0_RTX (mode);
9676 op1 = XEXP (x, 0);
9677 goto cost_minus;
9678 }
9679
9680 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9681 {
9682 /* Support (neg(fma...)) as a single instruction only if
9683 sign of zeros is unimportant. This matches the decision
9684 making in aarch64.md. */
9685 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9686 {
9687 /* FNMADD. */
9688 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9689 return true;
9690 }
9691 if (GET_CODE (op0) == MULT)
9692 {
9693 /* FNMUL. */
9694 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9695 return true;
9696 }
9697 if (speed)
9698 /* FNEG. */
9699 *cost += extra_cost->fp[mode == DFmode].neg;
9700 return false;
9701 }
9702
9703 return false;
9704
9705 case CLRSB:
9706 case CLZ:
9707 if (speed)
9708 {
9709 if (VECTOR_MODE_P (mode))
9710 *cost += extra_cost->vect.alu;
9711 else
9712 *cost += extra_cost->alu.clz;
9713 }
9714
9715 return false;
9716
9717 case COMPARE:
9718 op0 = XEXP (x, 0);
9719 op1 = XEXP (x, 1);
9720
9721 if (op1 == const0_rtx
9722 && GET_CODE (op0) == AND)
9723 {
9724 x = op0;
9725 mode = GET_MODE (op0);
9726 goto cost_logic;
9727 }
9728
9729 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9730 {
9731 /* TODO: A write to the CC flags possibly costs extra; this
9732 needs encoding in the cost tables. */
9733
9734 mode = GET_MODE (op0);
9735 /* ANDS. */
9736 if (GET_CODE (op0) == AND)
9737 {
9738 x = op0;
9739 goto cost_logic;
9740 }
9741
9742 if (GET_CODE (op0) == PLUS)
9743 {
9744 /* ADDS (and CMN alias). */
9745 x = op0;
9746 goto cost_plus;
9747 }
9748
9749 if (GET_CODE (op0) == MINUS)
9750 {
9751 /* SUBS. */
9752 x = op0;
9753 goto cost_minus;
9754 }
9755
9756 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9757 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9758 && CONST_INT_P (XEXP (op0, 2)))
9759 {
9760 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9761 Handle it here directly rather than going to cost_logic
9762 since we know the immediate generated for the TST is valid
9763 so we can avoid creating an intermediate rtx for it only
9764 for costing purposes. */
9765 if (speed)
9766 *cost += extra_cost->alu.logical;
9767
9768 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9769 ZERO_EXTRACT, 0, speed);
9770 return true;
9771 }
9772
9773 if (GET_CODE (op1) == NEG)
9774 {
9775 /* CMN. */
9776 if (speed)
9777 *cost += extra_cost->alu.arith;
9778
9779 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9780 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9781 return true;
9782 }
9783
9784 /* CMP.
9785
9786 Compare can freely swap the order of operands, and
9787 canonicalization puts the more complex operation first.
9788 But the integer MINUS logic expects the shift/extend
9789 operation in op1. */
9790 if (! (REG_P (op0)
9791 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9792 {
9793 op0 = XEXP (x, 1);
9794 op1 = XEXP (x, 0);
9795 }
9796 goto cost_minus;
9797 }
9798
9799 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9800 {
9801 /* FCMP. */
9802 if (speed)
9803 *cost += extra_cost->fp[mode == DFmode].compare;
9804
9805 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9806 {
9807 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9808 /* FCMP supports constant 0.0 for no extra cost. */
9809 return true;
9810 }
9811 return false;
9812 }
9813
9814 if (VECTOR_MODE_P (mode))
9815 {
9816 /* Vector compare. */
9817 if (speed)
9818 *cost += extra_cost->vect.alu;
9819
9820 if (aarch64_float_const_zero_rtx_p (op1))
9821 {
9822 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9823 cost. */
9824 return true;
9825 }
9826 return false;
9827 }
9828 return false;
9829
9830 case MINUS:
9831 {
9832 op0 = XEXP (x, 0);
9833 op1 = XEXP (x, 1);
9834
9835 cost_minus:
9836 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9837
9838 /* Detect valid immediates. */
9839 if ((GET_MODE_CLASS (mode) == MODE_INT
9840 || (GET_MODE_CLASS (mode) == MODE_CC
9841 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9842 && CONST_INT_P (op1)
9843 && aarch64_uimm12_shift (INTVAL (op1)))
9844 {
9845 if (speed)
9846 /* SUB(S) (immediate). */
9847 *cost += extra_cost->alu.arith;
9848 return true;
9849 }
9850
9851 /* Look for SUB (extended register). */
9852 if (is_a <scalar_int_mode> (mode, &int_mode)
9853 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9854 {
9855 if (speed)
9856 *cost += extra_cost->alu.extend_arith;
9857
9858 op1 = aarch64_strip_extend (op1, true);
9859 *cost += rtx_cost (op1, VOIDmode,
9860 (enum rtx_code) GET_CODE (op1), 0, speed);
9861 return true;
9862 }
9863
9864 rtx new_op1 = aarch64_strip_extend (op1, false);
9865
9866 /* Cost this as an FMA-alike operation. */
9867 if ((GET_CODE (new_op1) == MULT
9868 || aarch64_shift_p (GET_CODE (new_op1)))
9869 && code != COMPARE)
9870 {
9871 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9872 (enum rtx_code) code,
9873 speed);
9874 return true;
9875 }
9876
9877 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9878
9879 if (speed)
9880 {
9881 if (VECTOR_MODE_P (mode))
9882 {
9883 /* Vector SUB. */
9884 *cost += extra_cost->vect.alu;
9885 }
9886 else if (GET_MODE_CLASS (mode) == MODE_INT)
9887 {
9888 /* SUB(S). */
9889 *cost += extra_cost->alu.arith;
9890 }
9891 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9892 {
9893 /* FSUB. */
9894 *cost += extra_cost->fp[mode == DFmode].addsub;
9895 }
9896 }
9897 return true;
9898 }
9899
9900 case PLUS:
9901 {
9902 rtx new_op0;
9903
9904 op0 = XEXP (x, 0);
9905 op1 = XEXP (x, 1);
9906
9907 cost_plus:
9908 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9909 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9910 {
9911 /* CSINC. */
9912 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9913 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9914 return true;
9915 }
9916
9917 if (GET_MODE_CLASS (mode) == MODE_INT
9918 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9919 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9920 {
9921 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9922
9923 if (speed)
9924 /* ADD (immediate). */
9925 *cost += extra_cost->alu.arith;
9926 return true;
9927 }
9928
9929 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9930
9931 /* Look for ADD (extended register). */
9932 if (is_a <scalar_int_mode> (mode, &int_mode)
9933 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9934 {
9935 if (speed)
9936 *cost += extra_cost->alu.extend_arith;
9937
9938 op0 = aarch64_strip_extend (op0, true);
9939 *cost += rtx_cost (op0, VOIDmode,
9940 (enum rtx_code) GET_CODE (op0), 0, speed);
9941 return true;
9942 }
9943
9944 /* Strip any extend, leave shifts behind as we will
9945 cost them through mult_cost. */
9946 new_op0 = aarch64_strip_extend (op0, false);
9947
9948 if (GET_CODE (new_op0) == MULT
9949 || aarch64_shift_p (GET_CODE (new_op0)))
9950 {
9951 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9952 speed);
9953 return true;
9954 }
9955
9956 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9957
9958 if (speed)
9959 {
9960 if (VECTOR_MODE_P (mode))
9961 {
9962 /* Vector ADD. */
9963 *cost += extra_cost->vect.alu;
9964 }
9965 else if (GET_MODE_CLASS (mode) == MODE_INT)
9966 {
9967 /* ADD. */
9968 *cost += extra_cost->alu.arith;
9969 }
9970 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9971 {
9972 /* FADD. */
9973 *cost += extra_cost->fp[mode == DFmode].addsub;
9974 }
9975 }
9976 return true;
9977 }
9978
9979 case BSWAP:
9980 *cost = COSTS_N_INSNS (1);
9981
9982 if (speed)
9983 {
9984 if (VECTOR_MODE_P (mode))
9985 *cost += extra_cost->vect.alu;
9986 else
9987 *cost += extra_cost->alu.rev;
9988 }
9989 return false;
9990
9991 case IOR:
9992 if (aarch_rev16_p (x))
9993 {
9994 *cost = COSTS_N_INSNS (1);
9995
9996 if (speed)
9997 {
9998 if (VECTOR_MODE_P (mode))
9999 *cost += extra_cost->vect.alu;
10000 else
10001 *cost += extra_cost->alu.rev;
10002 }
10003 return true;
10004 }
10005
10006 if (aarch64_extr_rtx_p (x, &op0, &op1))
10007 {
10008 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10009 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10010 if (speed)
10011 *cost += extra_cost->alu.shift;
10012
10013 return true;
10014 }
10015 /* Fall through. */
10016 case XOR:
10017 case AND:
10018 cost_logic:
10019 op0 = XEXP (x, 0);
10020 op1 = XEXP (x, 1);
10021
10022 if (VECTOR_MODE_P (mode))
10023 {
10024 if (speed)
10025 *cost += extra_cost->vect.alu;
10026 return true;
10027 }
10028
10029 if (code == AND
10030 && GET_CODE (op0) == MULT
10031 && CONST_INT_P (XEXP (op0, 1))
10032 && CONST_INT_P (op1)
10033 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10034 INTVAL (op1)) != 0)
10035 {
10036 /* This is a UBFM/SBFM. */
10037 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10038 if (speed)
10039 *cost += extra_cost->alu.bfx;
10040 return true;
10041 }
10042
10043 if (is_int_mode (mode, &int_mode))
10044 {
10045 if (CONST_INT_P (op1))
10046 {
10047 /* We have a mask + shift version of a UBFIZ
10048 i.e. the *andim_ashift<mode>_bfiz pattern. */
10049 if (GET_CODE (op0) == ASHIFT
10050 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10051 XEXP (op0, 1)))
10052 {
10053 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10054 (enum rtx_code) code, 0, speed);
10055 if (speed)
10056 *cost += extra_cost->alu.bfx;
10057
10058 return true;
10059 }
10060 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10061 {
10062 /* We possibly get the immediate for free; this is not
10063 modelled. */
10064 *cost += rtx_cost (op0, int_mode,
10065 (enum rtx_code) code, 0, speed);
10066 if (speed)
10067 *cost += extra_cost->alu.logical;
10068
10069 return true;
10070 }
10071 }
10072 else
10073 {
10074 rtx new_op0 = op0;
10075
10076 /* Handle ORN, EON, or BIC. */
10077 if (GET_CODE (op0) == NOT)
10078 op0 = XEXP (op0, 0);
10079
10080 new_op0 = aarch64_strip_shift (op0);
10081
10082 /* If we had a shift on op0 then this is a logical-shift-
10083 by-register/immediate operation. Otherwise, this is just
10084 a logical operation. */
10085 if (speed)
10086 {
10087 if (new_op0 != op0)
10088 {
10089 /* Shift by immediate. */
10090 if (CONST_INT_P (XEXP (op0, 1)))
10091 *cost += extra_cost->alu.log_shift;
10092 else
10093 *cost += extra_cost->alu.log_shift_reg;
10094 }
10095 else
10096 *cost += extra_cost->alu.logical;
10097 }
10098
10099 /* In both cases we want to cost both operands. */
10100 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10101 0, speed);
10102 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10103 1, speed);
10104
10105 return true;
10106 }
10107 }
10108 return false;
10109
10110 case NOT:
10111 x = XEXP (x, 0);
10112 op0 = aarch64_strip_shift (x);
10113
10114 if (VECTOR_MODE_P (mode))
10115 {
10116 /* Vector NOT. */
10117 *cost += extra_cost->vect.alu;
10118 return false;
10119 }
10120
10121 /* MVN-shifted-reg. */
10122 if (op0 != x)
10123 {
10124 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10125
10126 if (speed)
10127 *cost += extra_cost->alu.log_shift;
10128
10129 return true;
10130 }
10131 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10132 Handle the second form here taking care that 'a' in the above can
10133 be a shift. */
10134 else if (GET_CODE (op0) == XOR)
10135 {
10136 rtx newop0 = XEXP (op0, 0);
10137 rtx newop1 = XEXP (op0, 1);
10138 rtx op0_stripped = aarch64_strip_shift (newop0);
10139
10140 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10141 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10142
10143 if (speed)
10144 {
10145 if (op0_stripped != newop0)
10146 *cost += extra_cost->alu.log_shift;
10147 else
10148 *cost += extra_cost->alu.logical;
10149 }
10150
10151 return true;
10152 }
10153 /* MVN. */
10154 if (speed)
10155 *cost += extra_cost->alu.logical;
10156
10157 return false;
10158
10159 case ZERO_EXTEND:
10160
10161 op0 = XEXP (x, 0);
10162 /* If a value is written in SI mode, then zero extended to DI
10163 mode, the operation will in general be free as a write to
10164 a 'w' register implicitly zeroes the upper bits of an 'x'
10165 register. However, if this is
10166
10167 (set (reg) (zero_extend (reg)))
10168
10169 we must cost the explicit register move. */
10170 if (mode == DImode
10171 && GET_MODE (op0) == SImode
10172 && outer == SET)
10173 {
10174 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10175
10176 /* If OP_COST is non-zero, then the cost of the zero extend
10177 is effectively the cost of the inner operation. Otherwise
10178 we have a MOV instruction and we take the cost from the MOV
10179 itself. This is true independently of whether we are
10180 optimizing for space or time. */
10181 if (op_cost)
10182 *cost = op_cost;
10183
10184 return true;
10185 }
10186 else if (MEM_P (op0))
10187 {
10188 /* All loads can zero extend to any size for free. */
10189 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10190 return true;
10191 }
10192
10193 op0 = aarch64_extend_bitfield_pattern_p (x);
10194 if (op0)
10195 {
10196 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10197 if (speed)
10198 *cost += extra_cost->alu.bfx;
10199 return true;
10200 }
10201
10202 if (speed)
10203 {
10204 if (VECTOR_MODE_P (mode))
10205 {
10206 /* UMOV. */
10207 *cost += extra_cost->vect.alu;
10208 }
10209 else
10210 {
10211 /* We generate an AND instead of UXTB/UXTH. */
10212 *cost += extra_cost->alu.logical;
10213 }
10214 }
10215 return false;
10216
10217 case SIGN_EXTEND:
10218 if (MEM_P (XEXP (x, 0)))
10219 {
10220 /* LDRSH. */
10221 if (speed)
10222 {
10223 rtx address = XEXP (XEXP (x, 0), 0);
10224 *cost += extra_cost->ldst.load_sign_extend;
10225
10226 *cost +=
10227 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10228 0, speed));
10229 }
10230 return true;
10231 }
10232
10233 op0 = aarch64_extend_bitfield_pattern_p (x);
10234 if (op0)
10235 {
10236 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10237 if (speed)
10238 *cost += extra_cost->alu.bfx;
10239 return true;
10240 }
10241
10242 if (speed)
10243 {
10244 if (VECTOR_MODE_P (mode))
10245 *cost += extra_cost->vect.alu;
10246 else
10247 *cost += extra_cost->alu.extend;
10248 }
10249 return false;
10250
10251 case ASHIFT:
10252 op0 = XEXP (x, 0);
10253 op1 = XEXP (x, 1);
10254
10255 if (CONST_INT_P (op1))
10256 {
10257 if (speed)
10258 {
10259 if (VECTOR_MODE_P (mode))
10260 {
10261 /* Vector shift (immediate). */
10262 *cost += extra_cost->vect.alu;
10263 }
10264 else
10265 {
10266 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10267 aliases. */
10268 *cost += extra_cost->alu.shift;
10269 }
10270 }
10271
10272 /* We can incorporate zero/sign extend for free. */
10273 if (GET_CODE (op0) == ZERO_EXTEND
10274 || GET_CODE (op0) == SIGN_EXTEND)
10275 op0 = XEXP (op0, 0);
10276
10277 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10278 return true;
10279 }
10280 else
10281 {
10282 if (VECTOR_MODE_P (mode))
10283 {
10284 if (speed)
10285 /* Vector shift (register). */
10286 *cost += extra_cost->vect.alu;
10287 }
10288 else
10289 {
10290 if (speed)
10291 /* LSLV. */
10292 *cost += extra_cost->alu.shift_reg;
10293
10294 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10295 && CONST_INT_P (XEXP (op1, 1))
10296 && known_eq (INTVAL (XEXP (op1, 1)),
10297 GET_MODE_BITSIZE (mode) - 1))
10298 {
10299 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10300 /* We already demanded XEXP (op1, 0) to be REG_P, so
10301 don't recurse into it. */
10302 return true;
10303 }
10304 }
10305 return false; /* All arguments need to be in registers. */
10306 }
10307
10308 case ROTATE:
10309 case ROTATERT:
10310 case LSHIFTRT:
10311 case ASHIFTRT:
10312 op0 = XEXP (x, 0);
10313 op1 = XEXP (x, 1);
10314
10315 if (CONST_INT_P (op1))
10316 {
10317 /* ASR (immediate) and friends. */
10318 if (speed)
10319 {
10320 if (VECTOR_MODE_P (mode))
10321 *cost += extra_cost->vect.alu;
10322 else
10323 *cost += extra_cost->alu.shift;
10324 }
10325
10326 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10327 return true;
10328 }
10329 else
10330 {
10331 if (VECTOR_MODE_P (mode))
10332 {
10333 if (speed)
10334 /* Vector shift (register). */
10335 *cost += extra_cost->vect.alu;
10336 }
10337 else
10338 {
10339 if (speed)
10340 /* ASR (register) and friends. */
10341 *cost += extra_cost->alu.shift_reg;
10342
10343 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10344 && CONST_INT_P (XEXP (op1, 1))
10345 && known_eq (INTVAL (XEXP (op1, 1)),
10346 GET_MODE_BITSIZE (mode) - 1))
10347 {
10348 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10349 /* We already demanded XEXP (op1, 0) to be REG_P, so
10350 don't recurse into it. */
10351 return true;
10352 }
10353 }
10354 return false; /* All arguments need to be in registers. */
10355 }
10356
10357 case SYMBOL_REF:
10358
10359 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10360 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10361 {
10362 /* LDR. */
10363 if (speed)
10364 *cost += extra_cost->ldst.load;
10365 }
10366 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10367 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10368 {
10369 /* ADRP, followed by ADD. */
10370 *cost += COSTS_N_INSNS (1);
10371 if (speed)
10372 *cost += 2 * extra_cost->alu.arith;
10373 }
10374 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10375 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10376 {
10377 /* ADR. */
10378 if (speed)
10379 *cost += extra_cost->alu.arith;
10380 }
10381
10382 if (flag_pic)
10383 {
10384 /* One extra load instruction, after accessing the GOT. */
10385 *cost += COSTS_N_INSNS (1);
10386 if (speed)
10387 *cost += extra_cost->ldst.load;
10388 }
10389 return true;
10390
10391 case HIGH:
10392 case LO_SUM:
10393 /* ADRP/ADD (immediate). */
10394 if (speed)
10395 *cost += extra_cost->alu.arith;
10396 return true;
10397
10398 case ZERO_EXTRACT:
10399 case SIGN_EXTRACT:
10400 /* UBFX/SBFX. */
10401 if (speed)
10402 {
10403 if (VECTOR_MODE_P (mode))
10404 *cost += extra_cost->vect.alu;
10405 else
10406 *cost += extra_cost->alu.bfx;
10407 }
10408
10409 /* We can trust that the immediates used will be correct (there
10410 are no by-register forms), so we need only cost op0. */
10411 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10412 return true;
10413
10414 case MULT:
10415 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10416 /* aarch64_rtx_mult_cost always handles recursion to its
10417 operands. */
10418 return true;
10419
10420 case MOD:
10421 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10422 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
10423 that of an unconditional negate. This case should only ever be reached through
10424 the set_smod_pow2_cheap check in expmed.c. */
10425 if (CONST_INT_P (XEXP (x, 1))
10426 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10427 && (mode == SImode || mode == DImode))
10428 {
10429 /* We expand to 4 instructions. Reset the baseline. */
10430 *cost = COSTS_N_INSNS (4);
10431
10432 if (speed)
10433 *cost += 2 * extra_cost->alu.logical
10434 + 2 * extra_cost->alu.arith;
10435
10436 return true;
10437 }
10438
10439 /* Fall-through. */
10440 case UMOD:
10441 if (speed)
10442 {
10443 /* Slightly prefer UMOD over SMOD. */
10444 if (VECTOR_MODE_P (mode))
10445 *cost += extra_cost->vect.alu;
10446 else if (GET_MODE_CLASS (mode) == MODE_INT)
10447 *cost += (extra_cost->mult[mode == DImode].add
10448 + extra_cost->mult[mode == DImode].idiv
10449 + (code == MOD ? 1 : 0));
10450 }
10451 return false; /* All arguments need to be in registers. */
10452
10453 case DIV:
10454 case UDIV:
10455 case SQRT:
10456 if (speed)
10457 {
10458 if (VECTOR_MODE_P (mode))
10459 *cost += extra_cost->vect.alu;
10460 else if (GET_MODE_CLASS (mode) == MODE_INT)
10461 /* There is no integer SQRT, so only DIV and UDIV can get
10462 here. */
10463 *cost += (extra_cost->mult[mode == DImode].idiv
10464 /* Slightly prefer UDIV over SDIV. */
10465 + (code == DIV ? 1 : 0));
10466 else
10467 *cost += extra_cost->fp[mode == DFmode].div;
10468 }
10469 return false; /* All arguments need to be in registers. */
10470
10471 case IF_THEN_ELSE:
10472 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10473 XEXP (x, 2), cost, speed);
10474
10475 case EQ:
10476 case NE:
10477 case GT:
10478 case GTU:
10479 case LT:
10480 case LTU:
10481 case GE:
10482 case GEU:
10483 case LE:
10484 case LEU:
10485
10486 return false; /* All arguments must be in registers. */
10487
10488 case FMA:
10489 op0 = XEXP (x, 0);
10490 op1 = XEXP (x, 1);
10491 op2 = XEXP (x, 2);
10492
10493 if (speed)
10494 {
10495 if (VECTOR_MODE_P (mode))
10496 *cost += extra_cost->vect.alu;
10497 else
10498 *cost += extra_cost->fp[mode == DFmode].fma;
10499 }
10500
10501 /* FMSUB, FNMADD, and FNMSUB are free. */
10502 if (GET_CODE (op0) == NEG)
10503 op0 = XEXP (op0, 0);
10504
10505 if (GET_CODE (op2) == NEG)
10506 op2 = XEXP (op2, 0);
10507
10508 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10509 and the by-element operand as operand 0. */
10510 if (GET_CODE (op1) == NEG)
10511 op1 = XEXP (op1, 0);
10512
10513 /* Catch vector-by-element operations. The by-element operand can
10514 either be (vec_duplicate (vec_select (x))) or just
10515 (vec_select (x)), depending on whether we are multiplying by
10516 a vector or a scalar.
10517
10518 Canonicalization is not very good in these cases: FMA4 will put the
10519 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10520 if (GET_CODE (op0) == VEC_DUPLICATE)
10521 op0 = XEXP (op0, 0);
10522 else if (GET_CODE (op1) == VEC_DUPLICATE)
10523 op1 = XEXP (op1, 0);
10524
10525 if (GET_CODE (op0) == VEC_SELECT)
10526 op0 = XEXP (op0, 0);
10527 else if (GET_CODE (op1) == VEC_SELECT)
10528 op1 = XEXP (op1, 0);
10529
10530 /* If the remaining parameters are not registers,
10531 get the cost to put them into registers. */
10532 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10533 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10534 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10535 return true;
10536
10537 case FLOAT:
10538 case UNSIGNED_FLOAT:
10539 if (speed)
10540 *cost += extra_cost->fp[mode == DFmode].fromint;
10541 return false;
10542
10543 case FLOAT_EXTEND:
10544 if (speed)
10545 {
10546 if (VECTOR_MODE_P (mode))
10547 {
10548 /* Vector widening conversion. */
10549 *cost += extra_cost->vect.alu;
10550 }
10551 else
10552 *cost += extra_cost->fp[mode == DFmode].widen;
10553 }
10554 return false;
10555
10556 case FLOAT_TRUNCATE:
10557 if (speed)
10558 {
10559 if (VECTOR_MODE_P (mode))
10560 {
10561 /* Vector narrowing conversion. */
10562 *cost += extra_cost->vect.alu;
10563 }
10564 else
10565 *cost += extra_cost->fp[mode == DFmode].narrow;
10566 }
10567 return false;
10568
10569 case FIX:
10570 case UNSIGNED_FIX:
10571 x = XEXP (x, 0);
10572 /* Strip the rounding part. They will all be implemented
10573 by the fcvt* family of instructions anyway. */
10574 if (GET_CODE (x) == UNSPEC)
10575 {
10576 unsigned int uns_code = XINT (x, 1);
10577
10578 if (uns_code == UNSPEC_FRINTA
10579 || uns_code == UNSPEC_FRINTM
10580 || uns_code == UNSPEC_FRINTN
10581 || uns_code == UNSPEC_FRINTP
10582 || uns_code == UNSPEC_FRINTZ)
10583 x = XVECEXP (x, 0, 0);
10584 }
10585
10586 if (speed)
10587 {
10588 if (VECTOR_MODE_P (mode))
10589 *cost += extra_cost->vect.alu;
10590 else
10591 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10592 }
10593
10594 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10595 fixed-point fcvt. */
10596 if (GET_CODE (x) == MULT
10597 && ((VECTOR_MODE_P (mode)
10598 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10599 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10600 {
10601 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10602 0, speed);
10603 return true;
10604 }
10605
10606 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10607 return true;
10608
10609 case ABS:
10610 if (VECTOR_MODE_P (mode))
10611 {
10612 /* ABS (vector). */
10613 if (speed)
10614 *cost += extra_cost->vect.alu;
10615 }
10616 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10617 {
10618 op0 = XEXP (x, 0);
10619
10620 /* FABD, which is analogous to FADD. */
10621 if (GET_CODE (op0) == MINUS)
10622 {
10623 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10624 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10625 if (speed)
10626 *cost += extra_cost->fp[mode == DFmode].addsub;
10627
10628 return true;
10629 }
10630 /* Simple FABS is analogous to FNEG. */
10631 if (speed)
10632 *cost += extra_cost->fp[mode == DFmode].neg;
10633 }
10634 else
10635 {
10636 /* Integer ABS will either be split into
10637 two arithmetic instructions, or will be an ABS
10638 (scalar), which we don't model. */
10639 *cost = COSTS_N_INSNS (2);
10640 if (speed)
10641 *cost += 2 * extra_cost->alu.arith;
10642 }
10643 return false;
10644
10645 case SMAX:
10646 case SMIN:
10647 if (speed)
10648 {
10649 if (VECTOR_MODE_P (mode))
10650 *cost += extra_cost->vect.alu;
10651 else
10652 {
10653 /* FMAXNM/FMINNM/FMAX/FMIN.
10654 TODO: This may not be accurate for all implementations, but
10655 we do not model this in the cost tables. */
10656 *cost += extra_cost->fp[mode == DFmode].addsub;
10657 }
10658 }
10659 return false;
10660
10661 case UNSPEC:
10662 /* The floating point round to integer frint* instructions. */
10663 if (aarch64_frint_unspec_p (XINT (x, 1)))
10664 {
10665 if (speed)
10666 *cost += extra_cost->fp[mode == DFmode].roundint;
10667
10668 return false;
10669 }
10670
10671 if (XINT (x, 1) == UNSPEC_RBIT)
10672 {
10673 if (speed)
10674 *cost += extra_cost->alu.rev;
10675
10676 return false;
10677 }
10678 break;
10679
10680 case TRUNCATE:
10681
10682 /* Decompose <su>muldi3_highpart. */
10683 if (/* (truncate:DI */
10684 mode == DImode
10685 /* (lshiftrt:TI */
10686 && GET_MODE (XEXP (x, 0)) == TImode
10687 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10688 /* (mult:TI */
10689 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10690 /* (ANY_EXTEND:TI (reg:DI))
10691 (ANY_EXTEND:TI (reg:DI))) */
10692 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10693 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10694 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10695 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10696 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10697 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10698 /* (const_int 64) */
10699 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10700 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10701 {
10702 /* UMULH/SMULH. */
10703 if (speed)
10704 *cost += extra_cost->mult[mode == DImode].extend;
10705 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10706 mode, MULT, 0, speed);
10707 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10708 mode, MULT, 1, speed);
10709 return true;
10710 }
10711
10712 /* Fall through. */
10713 default:
10714 break;
10715 }
10716
10717 if (dump_file
10718 && flag_aarch64_verbose_cost)
10719 fprintf (dump_file,
10720 "\nFailed to cost RTX. Assuming default cost.\n");
10721
10722 return true;
10723 }
10724
10725 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10726 calculated for X. This cost is stored in *COST. Returns true
10727 if the total cost of X was calculated. */
10728 static bool
10729 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10730 int param, int *cost, bool speed)
10731 {
10732 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10733
10734 if (dump_file
10735 && flag_aarch64_verbose_cost)
10736 {
10737 print_rtl_single (dump_file, x);
10738 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10739 speed ? "Hot" : "Cold",
10740 *cost, result ? "final" : "partial");
10741 }
10742
10743 return result;
10744 }
10745
10746 static int
10747 aarch64_register_move_cost (machine_mode mode,
10748 reg_class_t from_i, reg_class_t to_i)
10749 {
10750 enum reg_class from = (enum reg_class) from_i;
10751 enum reg_class to = (enum reg_class) to_i;
10752 const struct cpu_regmove_cost *regmove_cost
10753 = aarch64_tune_params.regmove_cost;
10754
10755 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10756 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10757 to = GENERAL_REGS;
10758
10759 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10760 from = GENERAL_REGS;
10761
10762 /* Moving between GPR and stack cost is the same as GP2GP. */
10763 if ((from == GENERAL_REGS && to == STACK_REG)
10764 || (to == GENERAL_REGS && from == STACK_REG))
10765 return regmove_cost->GP2GP;
10766
10767 /* To/From the stack register, we move via the gprs. */
10768 if (to == STACK_REG || from == STACK_REG)
10769 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10770 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10771
10772 if (known_eq (GET_MODE_SIZE (mode), 16))
10773 {
10774 /* 128-bit operations on general registers require 2 instructions. */
10775 if (from == GENERAL_REGS && to == GENERAL_REGS)
10776 return regmove_cost->GP2GP * 2;
10777 else if (from == GENERAL_REGS)
10778 return regmove_cost->GP2FP * 2;
10779 else if (to == GENERAL_REGS)
10780 return regmove_cost->FP2GP * 2;
10781
10782 /* When AdvSIMD instructions are disabled it is not possible to move
10783 a 128-bit value directly between Q registers. This is handled in
10784 secondary reload. A general register is used as a scratch to move
10785 the upper DI value and the lower DI value is moved directly,
10786 hence the cost is the sum of three moves. */
10787 if (! TARGET_SIMD)
10788 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10789
10790 return regmove_cost->FP2FP;
10791 }
10792
10793 if (from == GENERAL_REGS && to == GENERAL_REGS)
10794 return regmove_cost->GP2GP;
10795 else if (from == GENERAL_REGS)
10796 return regmove_cost->GP2FP;
10797 else if (to == GENERAL_REGS)
10798 return regmove_cost->FP2GP;
10799
10800 return regmove_cost->FP2FP;
10801 }
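/* For example, copying a 16-byte (e.g. TImode) value between general
   registers needs two X-register moves and so costs 2 * GP2GP, while
   without TARGET_SIMD a 128-bit FP-to-FP copy is costed as
   GP2FP + FP2GP + FP2FP to account for the general-register scratch
   used by the secondary reload.  */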
10802
10803 static int
10804 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10805 reg_class_t rclass ATTRIBUTE_UNUSED,
10806 bool in ATTRIBUTE_UNUSED)
10807 {
10808 return aarch64_tune_params.memmov_cost;
10809 }
10810
10811 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10812 to optimize 1.0/sqrt. */
10813
10814 static bool
10815 use_rsqrt_p (machine_mode mode)
10816 {
10817 return (!flag_trapping_math
10818 && flag_unsafe_math_optimizations
10819 && ((aarch64_tune_params.approx_modes->recip_sqrt
10820 & AARCH64_APPROX_MODE (mode))
10821 || flag_mrecip_low_precision_sqrt));
10822 }
10823
10824 /* Function to decide when to use the approximate reciprocal square root
10825 builtin. */
10826
10827 static tree
10828 aarch64_builtin_reciprocal (tree fndecl)
10829 {
10830 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10831
10832 if (!use_rsqrt_p (mode))
10833 return NULL_TREE;
10834 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10835 }
10836
10837 /* Emit instruction sequence to compute either the approximate square root
10838 or its approximate reciprocal, depending on the flag RECP, and return
10839 whether the sequence was emitted or not. */
10840
10841 bool
10842 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10843 {
10844 machine_mode mode = GET_MODE (dst);
10845
10846 if (GET_MODE_INNER (mode) == HFmode)
10847 {
10848 gcc_assert (!recp);
10849 return false;
10850 }
10851
10852 if (!recp)
10853 {
10854 if (!(flag_mlow_precision_sqrt
10855 || (aarch64_tune_params.approx_modes->sqrt
10856 & AARCH64_APPROX_MODE (mode))))
10857 return false;
10858
10859 if (flag_finite_math_only
10860 || flag_trapping_math
10861 || !flag_unsafe_math_optimizations
10862 || optimize_function_for_size_p (cfun))
10863 return false;
10864 }
10865 else
10866 /* Caller assumes we cannot fail. */
10867 gcc_assert (use_rsqrt_p (mode));
10868
10869 machine_mode mmsk = mode_for_int_vector (mode).require ();
10870 rtx xmsk = gen_reg_rtx (mmsk);
10871 if (!recp)
10872 /* When calculating the approximate square root, compare the
10873 argument with 0.0 and create a mask. */
10874 emit_insn (gen_rtx_SET (xmsk,
10875 gen_rtx_NEG (mmsk,
10876 gen_rtx_EQ (mmsk, src,
10877 CONST0_RTX (mode)))));
10878
10879 /* Estimate the approximate reciprocal square root. */
10880 rtx xdst = gen_reg_rtx (mode);
10881 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10882
10883 /* Iterate over the series twice for SF and thrice for DF. */
10884 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10885
10886 /* Optionally iterate over the series once less for faster performance
10887 while sacrificing the accuracy. */
10888 if ((recp && flag_mrecip_low_precision_sqrt)
10889 || (!recp && flag_mlow_precision_sqrt))
10890 iterations--;
10891
10892 /* Iterate over the series to calculate the approximate reciprocal square
10893 root. */
10894 rtx x1 = gen_reg_rtx (mode);
10895 while (iterations--)
10896 {
10897 rtx x2 = gen_reg_rtx (mode);
10898 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10899
10900 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10901
10902 if (iterations > 0)
10903 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10904 }
10905
10906 if (!recp)
10907 {
10908 /* Qualify the approximate reciprocal square root when the argument is
10909 0.0 by squashing the intermediary result to 0.0. */
10910 rtx xtmp = gen_reg_rtx (mmsk);
10911 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10912 gen_rtx_SUBREG (mmsk, xdst, 0)));
10913 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10914
10915 /* Calculate the approximate square root. */
10916 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10917 }
10918
10919 /* Finalize the approximation. */
10920 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10921
10922 return true;
10923 }
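
#if 0
/* Illustrative sketch only, not part of the original sources: the loop
   above emits the FRSQRTE/FRSQRTS Newton-Raphson recurrence

       x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

   which converges towards 1/sqrt (d).  A minimal scalar C version of the
   same refinement, assuming a rough initial estimate X0, would be:  */
static double
example_rsqrt_refine (double d, double x0, int iterations)
{
  double x = x0;
  while (iterations--)
    x = x * (3.0 - d * x * x) * 0.5;	/* One FRSQRTS-style step.  */
  return x;				/* ~= 1/sqrt (d).  */
}
#endif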
10924
10925 /* Emit the instruction sequence to compute the approximation for the division
10926 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10927
10928 bool
10929 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10930 {
10931 machine_mode mode = GET_MODE (quo);
10932
10933 if (GET_MODE_INNER (mode) == HFmode)
10934 return false;
10935
10936 bool use_approx_division_p = (flag_mlow_precision_div
10937 || (aarch64_tune_params.approx_modes->division
10938 & AARCH64_APPROX_MODE (mode)));
10939
10940 if (!flag_finite_math_only
10941 || flag_trapping_math
10942 || !flag_unsafe_math_optimizations
10943 || optimize_function_for_size_p (cfun)
10944 || !use_approx_division_p)
10945 return false;
10946
10947 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10948 return false;
10949
10950 /* Estimate the approximate reciprocal. */
10951 rtx xrcp = gen_reg_rtx (mode);
10952 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10953
10954 /* Iterate over the series twice for SF and thrice for DF. */
10955 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10956
10957 /* Optionally iterate over the series once less for faster performance,
10958 while sacrificing the accuracy. */
10959 if (flag_mlow_precision_div)
10960 iterations--;
10961
10962 /* Iterate over the series to calculate the approximate reciprocal. */
10963 rtx xtmp = gen_reg_rtx (mode);
10964 while (iterations--)
10965 {
10966 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10967
10968 if (iterations > 0)
10969 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10970 }
10971
10972 if (num != CONST1_RTX (mode))
10973 {
10974 /* As the approximate reciprocal of DEN is already calculated, only
10975 calculate the approximate division when NUM is not 1.0. */
10976 rtx xnum = force_reg (mode, num);
10977 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10978 }
10979
10980 /* Finalize the approximation. */
10981 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10982 return true;
10983 }
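
#if 0
/* Illustrative sketch only, not part of the original sources: the sequence
   above is the FRECPE/FRECPS Newton-Raphson refinement of the reciprocal

       x_{n+1} = x_n * (2 - den * x_n)

   followed by a final multiply by NUM.  A minimal scalar C version,
   assuming a rough initial estimate X0, would be:  */
static double
example_approx_div (double num, double den, double x0, int iterations)
{
  double x = x0;
  while (iterations--)
    x = x * (2.0 - den * x);	/* One FRECPS-style step.  */
  return num * x;		/* ~= num / den.  */
}
#endif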
10984
10985 /* Return the number of instructions that can be issued per cycle. */
10986 static int
10987 aarch64_sched_issue_rate (void)
10988 {
10989 return aarch64_tune_params.issue_rate;
10990 }
10991
10992 static int
10993 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10994 {
10995 int issue_rate = aarch64_sched_issue_rate ();
10996
10997 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10998 }
10999
11000
11001 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11002 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11003 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11004
11005 static int
11006 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11007 int ready_index)
11008 {
11009 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11010 }
11011
11012
11013 /* Vectorizer cost model target hooks. */
11014
11015 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11016 static int
11017 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11018 tree vectype,
11019 int misalign ATTRIBUTE_UNUSED)
11020 {
11021 unsigned elements;
11022 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11023 bool fp = false;
11024
11025 if (vectype != NULL)
11026 fp = FLOAT_TYPE_P (vectype);
11027
11028 switch (type_of_cost)
11029 {
11030 case scalar_stmt:
11031 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11032
11033 case scalar_load:
11034 return costs->scalar_load_cost;
11035
11036 case scalar_store:
11037 return costs->scalar_store_cost;
11038
11039 case vector_stmt:
11040 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11041
11042 case vector_load:
11043 return costs->vec_align_load_cost;
11044
11045 case vector_store:
11046 return costs->vec_store_cost;
11047
11048 case vec_to_scalar:
11049 return costs->vec_to_scalar_cost;
11050
11051 case scalar_to_vec:
11052 return costs->scalar_to_vec_cost;
11053
11054 case unaligned_load:
11055 case vector_gather_load:
11056 return costs->vec_unalign_load_cost;
11057
11058 case unaligned_store:
11059 case vector_scatter_store:
11060 return costs->vec_unalign_store_cost;
11061
11062 case cond_branch_taken:
11063 return costs->cond_taken_branch_cost;
11064
11065 case cond_branch_not_taken:
11066 return costs->cond_not_taken_branch_cost;
11067
11068 case vec_perm:
11069 return costs->vec_permute_cost;
11070
11071 case vec_promote_demote:
11072 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11073
11074 case vec_construct:
11075 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11076 return elements / 2 + 1;
11077
11078 default:
11079 gcc_unreachable ();
11080 }
11081 }
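
/* For illustration (not part of the original sources): with the
   vec_construct formula above, building a 4-lane vector from scalars is
   costed at 4/2 + 1 = 3 units and an 8-lane vector at 8/2 + 1 = 5 units,
   independent of the per-CPU cost tables used by the other cases.  */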
11082
11083 /* Implement targetm.vectorize.add_stmt_cost. */
11084 static unsigned
11085 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11086 struct _stmt_vec_info *stmt_info, int misalign,
11087 enum vect_cost_model_location where)
11088 {
11089 unsigned *cost = (unsigned *) data;
11090 unsigned retval = 0;
11091
11092 if (flag_vect_cost_model)
11093 {
11094 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11095 int stmt_cost =
11096 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11097
11098 /* Statements in an inner loop relative to the loop being
11099 vectorized are weighted more heavily. The value here is
11100 arbitrary and could potentially be improved with analysis. */
11101 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11102 count *= 50; /* FIXME */
11103
11104 retval = (unsigned) (count * stmt_cost);
11105 cost[where] += retval;
11106 }
11107
11108 return retval;
11109 }
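
/* For illustration (not part of the original sources): with the weighting
   above, COUNT = 3 statements of cost 2 inside an inner loop contribute
   3 * 50 * 2 = 300 units to cost[vect_body], whereas the same statements
   in the outermost vectorized loop contribute only 6.  */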
11110
11111 static void initialize_aarch64_code_model (struct gcc_options *);
11112
11113 /* Parse the TO_PARSE string and put the architecture struct that it
11114 selects into RES and the architectural features into ISA_FLAGS.
11115 Return an aarch64_parse_opt_result describing the parse result.
11116 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11117 When the TO_PARSE string contains an invalid extension,
11118 a copy of the string is created and stored to INVALID_EXTENSION. */
11119
11120 static enum aarch64_parse_opt_result
11121 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11122 uint64_t *isa_flags, std::string *invalid_extension)
11123 {
11124 const char *ext;
11125 const struct processor *arch;
11126 size_t len;
11127
11128 ext = strchr (to_parse, '+');
11129
11130 if (ext != NULL)
11131 len = ext - to_parse;
11132 else
11133 len = strlen (to_parse);
11134
11135 if (len == 0)
11136 return AARCH64_PARSE_MISSING_ARG;
11137
11138
11139 /* Loop through the list of supported ARCHes to find a match. */
11140 for (arch = all_architectures; arch->name != NULL; arch++)
11141 {
11142 if (strlen (arch->name) == len
11143 && strncmp (arch->name, to_parse, len) == 0)
11144 {
11145 uint64_t isa_temp = arch->flags;
11146
11147 if (ext != NULL)
11148 {
11149 /* TO_PARSE string contains at least one extension. */
11150 enum aarch64_parse_opt_result ext_res
11151 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11152
11153 if (ext_res != AARCH64_PARSE_OK)
11154 return ext_res;
11155 }
11156 /* Extension parsing was successful. Confirm the result
11157 arch and ISA flags. */
11158 *res = arch;
11159 *isa_flags = isa_temp;
11160 return AARCH64_PARSE_OK;
11161 }
11162 }
11163
11164 /* ARCH name not found in list. */
11165 return AARCH64_PARSE_INVALID_ARG;
11166 }
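
/* For illustration (not part of the original sources):
   "-march=armv8.2-a+crypto" is split at the first '+'; "armv8.2-a" is
   looked up in all_architectures and the remainder "+crypto" is handed to
   aarch64_parse_extension to adjust the ISA flag set.  */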
11167
11168 /* Parse the TO_PARSE string and put the result tuning in RES and the
11169 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11170 describing the parse result. If there is an error parsing, RES and
11171 ISA_FLAGS are left unchanged.
11172 When the TO_PARSE string contains an invalid extension,
11173 a copy of the string is created and stored to INVALID_EXTENSION. */
11174
11175 static enum aarch64_parse_opt_result
11176 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11177 uint64_t *isa_flags, std::string *invalid_extension)
11178 {
11179 const char *ext;
11180 const struct processor *cpu;
11181 size_t len;
11182
11183 ext = strchr (to_parse, '+');
11184
11185 if (ext != NULL)
11186 len = ext - to_parse;
11187 else
11188 len = strlen (to_parse);
11189
11190 if (len == 0)
11191 return AARCH64_PARSE_MISSING_ARG;
11192
11193
11194 /* Loop through the list of supported CPUs to find a match. */
11195 for (cpu = all_cores; cpu->name != NULL; cpu++)
11196 {
11197 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11198 {
11199 uint64_t isa_temp = cpu->flags;
11200
11201
11202 if (ext != NULL)
11203 {
11204 /* TO_PARSE string contains at least one extension. */
11205 enum aarch64_parse_opt_result ext_res
11206 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11207
11208 if (ext_res != AARCH64_PARSE_OK)
11209 return ext_res;
11210 }
11211 /* Extension parsing was successful. Confirm the result
11212 cpu and ISA flags. */
11213 *res = cpu;
11214 *isa_flags = isa_temp;
11215 return AARCH64_PARSE_OK;
11216 }
11217 }
11218
11219 /* CPU name not found in list. */
11220 return AARCH64_PARSE_INVALID_ARG;
11221 }
11222
11223 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11224 Return an aarch64_parse_opt_result describing the parse result.
11225 If the parsing fails, RES does not change. */
11226
11227 static enum aarch64_parse_opt_result
11228 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11229 {
11230 const struct processor *cpu;
11231
11232 /* Loop through the list of supported CPUs to find a match. */
11233 for (cpu = all_cores; cpu->name != NULL; cpu++)
11234 {
11235 if (strcmp (cpu->name, to_parse) == 0)
11236 {
11237 *res = cpu;
11238 return AARCH64_PARSE_OK;
11239 }
11240 }
11241
11242 /* CPU name not found in list. */
11243 return AARCH64_PARSE_INVALID_ARG;
11244 }
11245
11246 /* Parse TOKEN, which has length LENGTH to see if it is an option
11247 described in FLAG. If it is, return the index bit for that fusion type.
11248 If not, error (printing OPTION_NAME) and return zero. */
11249
11250 static unsigned int
11251 aarch64_parse_one_option_token (const char *token,
11252 size_t length,
11253 const struct aarch64_flag_desc *flag,
11254 const char *option_name)
11255 {
11256 for (; flag->name != NULL; flag++)
11257 {
11258 if (length == strlen (flag->name)
11259 && !strncmp (flag->name, token, length))
11260 return flag->flag;
11261 }
11262
11263 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11264 return 0;
11265 }
11266
11267 /* Parse OPTION which is a comma-separated list of flags to enable.
11268 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11269 default state we inherit from the CPU tuning structures. OPTION_NAME
11270 gives the top-level option we are parsing in the -moverride string,
11271 for use in error messages. */
11272
11273 static unsigned int
11274 aarch64_parse_boolean_options (const char *option,
11275 const struct aarch64_flag_desc *flags,
11276 unsigned int initial_state,
11277 const char *option_name)
11278 {
11279 const char separator = '.';
11280 const char* specs = option;
11281 const char* ntoken = option;
11282 unsigned int found_flags = initial_state;
11283
11284 while ((ntoken = strchr (specs, separator)))
11285 {
11286 size_t token_length = ntoken - specs;
11287 unsigned token_ops = aarch64_parse_one_option_token (specs,
11288 token_length,
11289 flags,
11290 option_name);
11291 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11292 in the token stream, reset the supported operations. So:
11293
11294 adrp+add.cmp+branch.none.adrp+add
11295
11296 would have the result of turning on only adrp+add fusion. */
11297 if (!token_ops)
11298 found_flags = 0;
11299
11300 found_flags |= token_ops;
11301 specs = ++ntoken;
11302 }
11303
11304 /* The string ended with a trailing separator; report it as ill-formed. */
11305 if (!(*specs))
11306 {
11307 error ("%s string ill-formed\n", option_name);
11308 return 0;
11309 }
11310
11311 /* We still have one more token to parse. */
11312 size_t token_length = strlen (specs);
11313 unsigned token_ops = aarch64_parse_one_option_token (specs,
11314 token_length,
11315 flags,
11316 option_name);
11317 if (!token_ops)
11318 found_flags = 0;
11319
11320 found_flags |= token_ops;
11321 return found_flags;
11322 }
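
/* For illustration (not part of the original sources): the list is reduced
   left to right on '.' separators, with "none" (or any unrecognised token)
   clearing everything accumulated so far.  Hence, as noted in the loop
   above, "adrp+add.cmp+branch.none.adrp+add" first accumulates the
   adrp+add and cmp+branch bits, resets to zero at "none", and finishes
   with only the adrp+add bit set.  */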
11323
11324 /* Support for overriding instruction fusion. */
11325
11326 static void
11327 aarch64_parse_fuse_string (const char *fuse_string,
11328 struct tune_params *tune)
11329 {
11330 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11331 aarch64_fusible_pairs,
11332 tune->fusible_ops,
11333 "fuse=");
11334 }
11335
11336 /* Support for overriding other tuning flags. */
11337
11338 static void
11339 aarch64_parse_tune_string (const char *tune_string,
11340 struct tune_params *tune)
11341 {
11342 tune->extra_tuning_flags
11343 = aarch64_parse_boolean_options (tune_string,
11344 aarch64_tuning_flags,
11345 tune->extra_tuning_flags,
11346 "tune=");
11347 }
11348
11349 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
11350 Accept the valid SVE vector widths allowed by
11351 aarch64_sve_vector_bits_enum and use it to override sve_width
11352 in TUNE. */
11353
11354 static void
11355 aarch64_parse_sve_width_string (const char *tune_string,
11356 struct tune_params *tune)
11357 {
11358 int width = -1;
11359
11360 int n = sscanf (tune_string, "%d", &width);
11361 if (n == EOF)
11362 {
11363 error ("invalid format for sve_width");
11364 return;
11365 }
11366 switch (width)
11367 {
11368 case SVE_128:
11369 case SVE_256:
11370 case SVE_512:
11371 case SVE_1024:
11372 case SVE_2048:
11373 break;
11374 default:
11375 error ("invalid sve_width value: %d", width);
11376 }
11377 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11378 }
11379
11380 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11381 we understand. If it is, extract the option string and handoff to
11382 the appropriate function. */
11383
11384 void
11385 aarch64_parse_one_override_token (const char* token,
11386 size_t length,
11387 struct tune_params *tune)
11388 {
11389 const struct aarch64_tuning_override_function *fn
11390 = aarch64_tuning_override_functions;
11391
11392 const char *option_part = strchr (token, '=');
11393 if (!option_part)
11394 {
11395 error ("tuning string missing in option (%s)", token);
11396 return;
11397 }
11398
11399 /* Get the length of the option name. */
11400 length = option_part - token;
11401 /* Skip the '=' to get to the option string. */
11402 option_part++;
11403
11404 for (; fn->name != NULL; fn++)
11405 {
11406 if (!strncmp (fn->name, token, length))
11407 {
11408 fn->parse_override (option_part, tune);
11409 return;
11410 }
11411 }
11412
11413 error ("unknown tuning option (%s)", token);
11414 return;
11415 }
11416
11417 /* Validate and clamp the -mtls-size value for the selected code model. */
11418
11419 static void
11420 initialize_aarch64_tls_size (struct gcc_options *opts)
11421 {
11422 if (aarch64_tls_size == 0)
11423 aarch64_tls_size = 24;
11424
11425 switch (opts->x_aarch64_cmodel_var)
11426 {
11427 case AARCH64_CMODEL_TINY:
11428 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11429 needs two instructions to address, so we clamp the size to 24. */
11430 if (aarch64_tls_size > 24)
11431 aarch64_tls_size = 24;
11432 break;
11433 case AARCH64_CMODEL_SMALL:
11434 /* The maximum TLS size allowed under small is 4G. */
11435 if (aarch64_tls_size > 32)
11436 aarch64_tls_size = 32;
11437 break;
11438 case AARCH64_CMODEL_LARGE:
11439 /* The maximum TLS size allowed under large is 16E.
11440 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11441 if (aarch64_tls_size > 48)
11442 aarch64_tls_size = 48;
11443 break;
11444 default:
11445 gcc_unreachable ();
11446 }
11447
11448 return;
11449 }
11450
11451 /* Parse STRING looking for options in the format:
11452 string :: option:string
11453 option :: name=substring
11454 name :: {a-z}
11455 substring :: defined by option. */
11456
11457 static void
11458 aarch64_parse_override_string (const char* input_string,
11459 struct tune_params* tune)
11460 {
11461 const char separator = ':';
11462 size_t string_length = strlen (input_string) + 1;
11463 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11464 char *string = string_root;
11465 strncpy (string, input_string, string_length);
11466 string[string_length - 1] = '\0';
11467
11468 char* ntoken = string;
11469
11470 while ((ntoken = strchr (string, separator)))
11471 {
11472 size_t token_length = ntoken - string;
11473 /* Make this substring look like a string. */
11474 *ntoken = '\0';
11475 aarch64_parse_one_override_token (string, token_length, tune);
11476 string = ++ntoken;
11477 }
11478
11479 /* One last option to parse. */
11480 aarch64_parse_one_override_token (string, strlen (string), tune);
11481 free (string_root);
11482 }
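
/* For illustration (not part of the original sources): an override string
   such as "-moverride=fuse=adrp+add.cmp+branch:sve_width=256" is split at
   ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256", which are then
   dispatched to aarch64_parse_fuse_string and aarch64_parse_sve_width_string
   respectively via aarch64_parse_one_override_token.  */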
11483
11484
11485 static void
11486 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11487 {
11488 if (accepted_branch_protection_string)
11489 {
11490 opts->x_aarch64_branch_protection_string
11491 = xstrdup (accepted_branch_protection_string);
11492 }
11493
11494 /* PR 70044: We have to be careful about being called multiple times for the
11495 same function. This means all changes should be repeatable. */
11496
11497 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11498 Disable the frame pointer flag so the mid-end will not use a frame
11499 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11500 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11501 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11502 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11503 if (opts->x_flag_omit_frame_pointer == 0)
11504 opts->x_flag_omit_frame_pointer = 2;
11505
11506 /* If not optimizing for size, set the default
11507 alignment to what the target wants. */
11508 if (!opts->x_optimize_size)
11509 {
11510 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11511 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11512 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11513 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11514 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11515 opts->x_str_align_functions = aarch64_tune_params.function_align;
11516 }
11517
11518 /* We default to no pc-relative literal loads. */
11519
11520 aarch64_pcrelative_literal_loads = false;
11521
11522 /* If -mpc-relative-literal-loads is set on the command line, this
11523 implies that the user asked for PC relative literal loads. */
11524 if (opts->x_pcrelative_literal_loads == 1)
11525 aarch64_pcrelative_literal_loads = true;
11526
11527 /* In the tiny memory model it makes no sense to disallow PC relative
11528 literal pool loads. */
11529 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11530 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11531 aarch64_pcrelative_literal_loads = true;
11532
11533 /* When enabling the lower precision Newton series for the square root, also
11534 enable it for the reciprocal square root, since the latter is an
11535 intermediary step for the former. */
11536 if (flag_mlow_precision_sqrt)
11537 flag_mrecip_low_precision_sqrt = true;
11538 }
11539
11540 /* 'Unpack' the internal tuning structs and update the options
11541 in OPTS. The caller must have set up selected_tune and selected_arch
11542 as all the other target-specific codegen decisions are
11543 derived from them. */
11544
11545 void
11546 aarch64_override_options_internal (struct gcc_options *opts)
11547 {
11548 aarch64_tune_flags = selected_tune->flags;
11549 aarch64_tune = selected_tune->sched_core;
11550 /* Make a copy of the tuning parameters attached to the core, which
11551 we may later overwrite. */
11552 aarch64_tune_params = *(selected_tune->tune);
11553 aarch64_architecture_version = selected_arch->architecture_version;
11554
11555 if (opts->x_aarch64_override_tune_string)
11556 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11557 &aarch64_tune_params);
11558
11559 /* This target defaults to strict volatile bitfields. */
11560 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11561 opts->x_flag_strict_volatile_bitfields = 1;
11562
11563 if (aarch64_stack_protector_guard == SSP_GLOBAL
11564 && opts->x_aarch64_stack_protector_guard_offset_str)
11565 {
11566 error ("incompatible options %<-mstack-protector-guard=global%> and "
11567 "%<-mstack-protector-guard-offset=%s%>",
11568 aarch64_stack_protector_guard_offset_str);
11569 }
11570
11571 if (aarch64_stack_protector_guard == SSP_SYSREG
11572 && !(opts->x_aarch64_stack_protector_guard_offset_str
11573 && opts->x_aarch64_stack_protector_guard_reg_str))
11574 {
11575 error ("both %<-mstack-protector-guard-offset%> and "
11576 "%<-mstack-protector-guard-reg%> must be used "
11577 "with %<-mstack-protector-guard=sysreg%>");
11578 }
11579
11580 if (opts->x_aarch64_stack_protector_guard_reg_str)
11581 {
11582 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11583 error ("specify a system register with a small string length");
11584 }
11585
11586 if (opts->x_aarch64_stack_protector_guard_offset_str)
11587 {
11588 char *end;
11589 const char *str = aarch64_stack_protector_guard_offset_str;
11590 errno = 0;
11591 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11592 if (!*str || *end || errno)
11593 error ("%qs is not a valid offset in %qs", str,
11594 "-mstack-protector-guard-offset=");
11595 aarch64_stack_protector_guard_offset = offs;
11596 }
11597
11598 initialize_aarch64_code_model (opts);
11599 initialize_aarch64_tls_size (opts);
11600
11601 int queue_depth = 0;
11602 switch (aarch64_tune_params.autoprefetcher_model)
11603 {
11604 case tune_params::AUTOPREFETCHER_OFF:
11605 queue_depth = -1;
11606 break;
11607 case tune_params::AUTOPREFETCHER_WEAK:
11608 queue_depth = 0;
11609 break;
11610 case tune_params::AUTOPREFETCHER_STRONG:
11611 queue_depth = max_insn_queue_index + 1;
11612 break;
11613 default:
11614 gcc_unreachable ();
11615 }
11616
11617 /* We don't mind passing in global_options_set here as we don't use
11618 the *options_set structs anyway. */
11619 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11620 queue_depth,
11621 opts->x_param_values,
11622 global_options_set.x_param_values);
11623
11624 /* Set up parameters to be used in prefetching algorithm. Do not
11625 override the defaults unless we are tuning for a core we have
11626 researched values for. */
11627 if (aarch64_tune_params.prefetch->num_slots > 0)
11628 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11629 aarch64_tune_params.prefetch->num_slots,
11630 opts->x_param_values,
11631 global_options_set.x_param_values);
11632 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11633 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11634 aarch64_tune_params.prefetch->l1_cache_size,
11635 opts->x_param_values,
11636 global_options_set.x_param_values);
11637 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11638 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11639 aarch64_tune_params.prefetch->l1_cache_line_size,
11640 opts->x_param_values,
11641 global_options_set.x_param_values);
11642 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11643 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11644 aarch64_tune_params.prefetch->l2_cache_size,
11645 opts->x_param_values,
11646 global_options_set.x_param_values);
11647 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11648 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11649 0,
11650 opts->x_param_values,
11651 global_options_set.x_param_values);
11652 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11653 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11654 aarch64_tune_params.prefetch->minimum_stride,
11655 opts->x_param_values,
11656 global_options_set.x_param_values);
11657
11658 /* Use the alternative scheduling-pressure algorithm by default. */
11659 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11660 opts->x_param_values,
11661 global_options_set.x_param_values);
11662
11663 /* If the user hasn't changed it via configure then set the default to 64 KB
11664 for the backend. */
11665 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11666 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11667 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11668 opts->x_param_values,
11669 global_options_set.x_param_values);
11670
11671 /* Validate the guard size. */
11672 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11673
11674 /* Enforce that interval is the same size as size so the mid-end does the
11675 right thing. */
11676 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11677 guard_size,
11678 opts->x_param_values,
11679 global_options_set.x_param_values);
11680
11681 /* The maybe_set calls won't update the value if the user has explicitly set
11682 one. Which means we need to validate that probing interval and guard size
11683 are equal. */
11684 int probe_interval
11685 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11686 if (guard_size != probe_interval)
11687 error ("stack clash guard size %<%d%> must be equal to probing interval "
11688 "%<%d%>", guard_size, probe_interval);
11689
11690 /* Enable sw prefetching at specified optimization level for
11691 CPUS that have prefetch. Lower optimization level threshold by 1
11692 when profiling is enabled. */
11693 if (opts->x_flag_prefetch_loop_arrays < 0
11694 && !opts->x_optimize_size
11695 && aarch64_tune_params.prefetch->default_opt_level >= 0
11696 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11697 opts->x_flag_prefetch_loop_arrays = 1;
11698
11699 if (opts->x_aarch64_arch_string == NULL)
11700 opts->x_aarch64_arch_string = selected_arch->name;
11701 if (opts->x_aarch64_cpu_string == NULL)
11702 opts->x_aarch64_cpu_string = selected_cpu->name;
11703 if (opts->x_aarch64_tune_string == NULL)
11704 opts->x_aarch64_tune_string = selected_tune->name;
11705
11706 aarch64_override_options_after_change_1 (opts);
11707 }
11708
11709 /* Print a hint with a suggestion for a core or architecture name that
11710 most closely resembles what the user passed in STR. ARCH is true if
11711 the user is asking for an architecture name. ARCH is false if the user
11712 is asking for a core name. */
11713
11714 static void
11715 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11716 {
11717 auto_vec<const char *> candidates;
11718 const struct processor *entry = arch ? all_architectures : all_cores;
11719 for (; entry->name != NULL; entry++)
11720 candidates.safe_push (entry->name);
11721
11722 #ifdef HAVE_LOCAL_CPU_DETECT
11723 /* Add also "native" as possible value. */
11724 if (arch)
11725 candidates.safe_push ("native");
11726 #endif
11727
11728 char *s;
11729 const char *hint = candidates_list_and_hint (str, s, candidates);
11730 if (hint)
11731 inform (input_location, "valid arguments are: %s;"
11732 " did you mean %qs?", s, hint);
11733 else
11734 inform (input_location, "valid arguments are: %s", s);
11735
11736 XDELETEVEC (s);
11737 }
11738
11739 /* Print a hint with a suggestion for a core name that most closely resembles
11740 what the user passed in STR. */
11741
11742 inline static void
11743 aarch64_print_hint_for_core (const char *str)
11744 {
11745 aarch64_print_hint_for_core_or_arch (str, false);
11746 }
11747
11748 /* Print a hint with a suggestion for an architecture name that most closely
11749 resembles what the user passed in STR. */
11750
11751 inline static void
11752 aarch64_print_hint_for_arch (const char *str)
11753 {
11754 aarch64_print_hint_for_core_or_arch (str, true);
11755 }
11756
11757
11758 /* Print a hint with a suggestion for an extension name
11759 that most closely resembles what the user passed in STR. */
11760
11761 void
11762 aarch64_print_hint_for_extensions (const std::string &str)
11763 {
11764 auto_vec<const char *> candidates;
11765 aarch64_get_all_extension_candidates (&candidates);
11766 char *s;
11767 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11768 if (hint)
11769 inform (input_location, "valid arguments are: %s;"
11770 " did you mean %qs?", s, hint);
11771 else
11772 inform (input_location, "valid arguments are: %s", s);
11773
11774 XDELETEVEC (s);
11775 }
11776
11777 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11778 specified in STR and throw errors if appropriate. Put the results if
11779 they are valid in RES and ISA_FLAGS. Return whether the option is
11780 valid. */
11781
11782 static bool
11783 aarch64_validate_mcpu (const char *str, const struct processor **res,
11784 uint64_t *isa_flags)
11785 {
11786 std::string invalid_extension;
11787 enum aarch64_parse_opt_result parse_res
11788 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11789
11790 if (parse_res == AARCH64_PARSE_OK)
11791 return true;
11792
11793 switch (parse_res)
11794 {
11795 case AARCH64_PARSE_MISSING_ARG:
11796 error ("missing cpu name in %<-mcpu=%s%>", str);
11797 break;
11798 case AARCH64_PARSE_INVALID_ARG:
11799 error ("unknown value %qs for %<-mcpu%>", str);
11800 aarch64_print_hint_for_core (str);
11801 break;
11802 case AARCH64_PARSE_INVALID_FEATURE:
11803 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11804 invalid_extension.c_str (), str);
11805 aarch64_print_hint_for_extensions (invalid_extension);
11806 break;
11807 default:
11808 gcc_unreachable ();
11809 }
11810
11811 return false;
11812 }
11813
11814 /* Parses CONST_STR for branch protection features specified in
11815 aarch64_branch_protect_types, and set any global variables required. Returns
11816 the parsing result and assigns LAST_STR to the last processed token from
11817 CONST_STR so that it can be used for error reporting. */
11818
11819 static enum
11820 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11821 char** last_str)
11822 {
11823 char *str_root = xstrdup (const_str);
11824 char* token_save = NULL;
11825 char *str = strtok_r (str_root, "+", &token_save);
11826 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11827 if (!str)
11828 res = AARCH64_PARSE_MISSING_ARG;
11829 else
11830 {
11831 char *next_str = strtok_r (NULL, "+", &token_save);
11832 /* Reset the branch protection features to their defaults. */
11833 aarch64_handle_no_branch_protection (NULL, NULL);
11834
11835 while (str && res == AARCH64_PARSE_OK)
11836 {
11837 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11838 bool found = false;
11839 /* Search for this type. */
11840 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11841 {
11842 if (strcmp (str, type->name) == 0)
11843 {
11844 found = true;
11845 res = type->handler (str, next_str);
11846 str = next_str;
11847 next_str = strtok_r (NULL, "+", &token_save);
11848 }
11849 else
11850 type++;
11851 }
11852 if (found && res == AARCH64_PARSE_OK)
11853 {
11854 bool found_subtype = true;
11855 /* Loop through each token until we find one that isn't a
11856 subtype. */
11857 while (found_subtype)
11858 {
11859 found_subtype = false;
11860 const aarch64_branch_protect_type *subtype = type->subtypes;
11861 /* Search for the subtype. */
11862 while (str && subtype && subtype->name && !found_subtype
11863 && res == AARCH64_PARSE_OK)
11864 {
11865 if (strcmp (str, subtype->name) == 0)
11866 {
11867 found_subtype = true;
11868 res = subtype->handler (str, next_str);
11869 str = next_str;
11870 next_str = strtok_r (NULL, "+", &token_save);
11871 }
11872 else
11873 subtype++;
11874 }
11875 }
11876 }
11877 else if (!found)
11878 res = AARCH64_PARSE_INVALID_ARG;
11879 }
11880 }
11881 /* Copy the last processed token into the argument to pass it back.
11882 Used by option and attribute validation to print the offending token. */
11883 if (last_str)
11884 {
11885 if (str) strcpy (*last_str, str);
11886 else *last_str = NULL;
11887 }
11888 if (res == AARCH64_PARSE_OK)
11889 {
11890 /* If needed, alloc the accepted string then copy in const_str.
11891 Used by override_option_after_change_1. */
11892 if (!accepted_branch_protection_string)
11893 accepted_branch_protection_string = (char *) xmalloc (
11894 BRANCH_PROTECT_STR_MAX
11895 + 1);
11896 strncpy (accepted_branch_protection_string, const_str,
11897 BRANCH_PROTECT_STR_MAX + 1);
11898 /* Forcibly null-terminate. */
11899 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11900 }
11901 return res;
11902 }
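
/* For illustration (not part of the original sources): given
   "pac-ret+leaf+bti", the walk above matches the "pac-ret" type, consumes
   its "leaf" subtype, then matches the "bti" type; an unrecognised token
   such as "pac-ret+foo" stops the walk with AARCH64_PARSE_INVALID_ARG and
   "foo" is passed back through LAST_STR for the error message.  */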
11903
11904 static bool
11905 aarch64_validate_mbranch_protection (const char *const_str)
11906 {
11907 char *str = (char *) xmalloc (strlen (const_str) + 1);
11908 enum aarch64_parse_opt_result res =
11909 aarch64_parse_branch_protection (const_str, &str);
11910 if (res == AARCH64_PARSE_INVALID_ARG)
11911 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11912 else if (res == AARCH64_PARSE_MISSING_ARG)
11913 error ("missing argument for %<-mbranch-protection=%>");
11914 free (str);
11915 return res == AARCH64_PARSE_OK;
11916 }
11917
11918 /* Validate a command-line -march option. Parse the arch and extensions
11919 (if any) specified in STR and throw errors if appropriate. Put the
11920 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11921 option is valid. */
11922
11923 static bool
11924 aarch64_validate_march (const char *str, const struct processor **res,
11925 uint64_t *isa_flags)
11926 {
11927 std::string invalid_extension;
11928 enum aarch64_parse_opt_result parse_res
11929 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11930
11931 if (parse_res == AARCH64_PARSE_OK)
11932 return true;
11933
11934 switch (parse_res)
11935 {
11936 case AARCH64_PARSE_MISSING_ARG:
11937 error ("missing arch name in %<-march=%s%>", str);
11938 break;
11939 case AARCH64_PARSE_INVALID_ARG:
11940 error ("unknown value %qs for %<-march%>", str);
11941 aarch64_print_hint_for_arch (str);
11942 break;
11943 case AARCH64_PARSE_INVALID_FEATURE:
11944 error ("invalid feature modifier %qs in %<-march=%s%>",
11945 invalid_extension.c_str (), str);
11946 aarch64_print_hint_for_extensions (invalid_extension);
11947 break;
11948 default:
11949 gcc_unreachable ();
11950 }
11951
11952 return false;
11953 }
11954
11955 /* Validate a command-line -mtune option. Parse the cpu
11956 specified in STR and throw errors if appropriate. Put the
11957 result, if it is valid, in RES. Return whether the option is
11958 valid. */
11959
11960 static bool
11961 aarch64_validate_mtune (const char *str, const struct processor **res)
11962 {
11963 enum aarch64_parse_opt_result parse_res
11964 = aarch64_parse_tune (str, res);
11965
11966 if (parse_res == AARCH64_PARSE_OK)
11967 return true;
11968
11969 switch (parse_res)
11970 {
11971 case AARCH64_PARSE_MISSING_ARG:
11972 error ("missing cpu name in %<-mtune=%s%>", str);
11973 break;
11974 case AARCH64_PARSE_INVALID_ARG:
11975 error ("unknown value %qs for %<-mtune%>", str);
11976 aarch64_print_hint_for_core (str);
11977 break;
11978 default:
11979 gcc_unreachable ();
11980 }
11981 return false;
11982 }
11983
11984 /* Return the CPU corresponding to the enum CPU.
11985 If it doesn't specify a cpu, return the default. */
11986
11987 static const struct processor *
11988 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11989 {
11990 if (cpu != aarch64_none)
11991 return &all_cores[cpu];
11992
11993 /* The & 0x3f is to extract the bottom 6 bits that encode the
11994 default cpu as selected by the --with-cpu GCC configure option
11995 in config.gcc.
11996 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11997 flags mechanism should be reworked to make it more sane. */
11998 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11999 }
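
/* For illustration (not part of the original sources): TARGET_CPU_DEFAULT
   packs the configure-time default as (isa_flags << 6) | cpu_index, so the
   "& 0x3f" above recovers the core index while aarch64_override_options
   recovers the flags with "TARGET_CPU_DEFAULT >> 6".  */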
12000
12001 /* Return the architecture corresponding to the enum ARCH.
12002 If it doesn't specify a valid architecture, return the default. */
12003
12004 static const struct processor *
12005 aarch64_get_arch (enum aarch64_arch arch)
12006 {
12007 if (arch != aarch64_no_arch)
12008 return &all_architectures[arch];
12009
12010 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12011
12012 return &all_architectures[cpu->arch];
12013 }
12014
12015 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12016
12017 static poly_uint16
12018 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12019 {
12020 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12021 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12022 deciding which .md file patterns to use and when deciding whether
12023 something is a legitimate address or constant. */
12024 if (value == SVE_SCALABLE || value == SVE_128)
12025 return poly_uint16 (2, 2);
12026 else
12027 return (int) value / 64;
12028 }
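
/* For illustration (not part of the original sources):
   -msve-vector-bits=512 gives a fixed VG of 512/64 = 8 granules, while
   both "scalable" and 128 give the runtime-variable poly_uint16 (2, 2),
   i.e. 2 + 2*x 64-bit granules for some non-negative runtime x.  */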
12029
12030 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12031 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12032 tuning structs. In particular it must set selected_tune and
12033 aarch64_isa_flags that define the available ISA features and tuning
12034 decisions. It must also set selected_arch as this will be used to
12035 output the .arch asm tags for each function. */
12036
12037 static void
12038 aarch64_override_options (void)
12039 {
12040 uint64_t cpu_isa = 0;
12041 uint64_t arch_isa = 0;
12042 aarch64_isa_flags = 0;
12043
12044 bool valid_cpu = true;
12045 bool valid_tune = true;
12046 bool valid_arch = true;
12047
12048 selected_cpu = NULL;
12049 selected_arch = NULL;
12050 selected_tune = NULL;
12051
12052 if (aarch64_branch_protection_string)
12053 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12054
12055 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12056 If either of -march or -mtune is given, they override their
12057 respective component of -mcpu. */
12058 if (aarch64_cpu_string)
12059 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12060 &cpu_isa);
12061
12062 if (aarch64_arch_string)
12063 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12064 &arch_isa);
12065
12066 if (aarch64_tune_string)
12067 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12068
12069 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12070 SUBTARGET_OVERRIDE_OPTIONS;
12071 #endif
12072
12073 /* If the user did not specify a processor, choose the default
12074 one for them. This will be the CPU set during configuration using
12075 --with-cpu, otherwise it is "generic". */
12076 if (!selected_cpu)
12077 {
12078 if (selected_arch)
12079 {
12080 selected_cpu = &all_cores[selected_arch->ident];
12081 aarch64_isa_flags = arch_isa;
12082 explicit_arch = selected_arch->arch;
12083 }
12084 else
12085 {
12086 /* Get default configure-time CPU. */
12087 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12088 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12089 }
12090
12091 if (selected_tune)
12092 explicit_tune_core = selected_tune->ident;
12093 }
12094 /* If both -mcpu and -march are specified check that they are architecturally
12095 compatible, warn if they're not and prefer the -march ISA flags. */
12096 else if (selected_arch)
12097 {
12098 if (selected_arch->arch != selected_cpu->arch)
12099 {
12100 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12101 all_architectures[selected_cpu->arch].name,
12102 selected_arch->name);
12103 }
12104 aarch64_isa_flags = arch_isa;
12105 explicit_arch = selected_arch->arch;
12106 explicit_tune_core = selected_tune ? selected_tune->ident
12107 : selected_cpu->ident;
12108 }
12109 else
12110 {
12111 /* -mcpu but no -march. */
12112 aarch64_isa_flags = cpu_isa;
12113 explicit_tune_core = selected_tune ? selected_tune->ident
12114 : selected_cpu->ident;
12115 gcc_assert (selected_cpu);
12116 selected_arch = &all_architectures[selected_cpu->arch];
12117 explicit_arch = selected_arch->arch;
12118 }
12119
12120 /* Set the arch as well, as we will need it when outputting
12121 the .arch directive in assembly. */
12122 if (!selected_arch)
12123 {
12124 gcc_assert (selected_cpu);
12125 selected_arch = &all_architectures[selected_cpu->arch];
12126 }
12127
12128 if (!selected_tune)
12129 selected_tune = selected_cpu;
12130
12131 if (aarch64_enable_bti == 2)
12132 {
12133 #ifdef TARGET_ENABLE_BTI
12134 aarch64_enable_bti = 1;
12135 #else
12136 aarch64_enable_bti = 0;
12137 #endif
12138 }
12139
12140 /* Return address signing is currently not supported for ILP32 targets. For
12141 LP64 targets use the configured option in the absence of a command-line
12142 option for -mbranch-protection. */
12143 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12144 {
12145 #ifdef TARGET_ENABLE_PAC_RET
12146 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12147 #else
12148 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12149 #endif
12150 }
12151
12152 #ifndef HAVE_AS_MABI_OPTION
12153 /* The compiler may have been configured with 2.23.* binutils, which does
12154 not have support for ILP32. */
12155 if (TARGET_ILP32)
12156 error ("assembler does not support %<-mabi=ilp32%>");
12157 #endif
12158
12159 /* Convert -msve-vector-bits to a VG count. */
12160 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12161
12162 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12163 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12164
12165 /* Make sure we properly set up the explicit options. */
12166 if ((aarch64_cpu_string && valid_cpu)
12167 || (aarch64_tune_string && valid_tune))
12168 gcc_assert (explicit_tune_core != aarch64_none);
12169
12170 if ((aarch64_cpu_string && valid_cpu)
12171 || (aarch64_arch_string && valid_arch))
12172 gcc_assert (explicit_arch != aarch64_no_arch);
12173
12174 /* The pass to insert speculation tracking runs before
12175 shrink-wrapping and the latter does not know how to update the
12176 tracking status. So disable it in this case. */
12177 if (aarch64_track_speculation)
12178 flag_shrink_wrap = 0;
12179
12180 aarch64_override_options_internal (&global_options);
12181
12182 /* Save these options as the default ones in case we push and pop them later
12183 while processing functions with potential target attributes. */
12184 target_option_default_node = target_option_current_node
12185 = build_target_option_node (&global_options);
12186 }
12187
12188 /* Implement targetm.override_options_after_change. */
12189
12190 static void
12191 aarch64_override_options_after_change (void)
12192 {
12193 aarch64_override_options_after_change_1 (&global_options);
12194 }
12195
12196 static struct machine_function *
12197 aarch64_init_machine_status (void)
12198 {
12199 struct machine_function *machine;
12200 machine = ggc_cleared_alloc<machine_function> ();
12201 return machine;
12202 }
12203
12204 void
12205 aarch64_init_expanders (void)
12206 {
12207 init_machine_status = aarch64_init_machine_status;
12208 }
12209
12210 /* Select the code model to use, taking the PIC flags into account. */
12211 static void
12212 initialize_aarch64_code_model (struct gcc_options *opts)
12213 {
12214 if (opts->x_flag_pic)
12215 {
12216 switch (opts->x_aarch64_cmodel_var)
12217 {
12218 case AARCH64_CMODEL_TINY:
12219 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12220 break;
12221 case AARCH64_CMODEL_SMALL:
12222 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12223 aarch64_cmodel = (flag_pic == 2
12224 ? AARCH64_CMODEL_SMALL_PIC
12225 : AARCH64_CMODEL_SMALL_SPIC);
12226 #else
12227 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12228 #endif
12229 break;
12230 case AARCH64_CMODEL_LARGE:
12231 sorry ("code model %qs with %<-f%s%>", "large",
12232 opts->x_flag_pic > 1 ? "PIC" : "pic");
12233 break;
12234 default:
12235 gcc_unreachable ();
12236 }
12237 }
12238 else
12239 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12240 }
12241
12242 /* Implement TARGET_OPTION_SAVE. */
12243
12244 static void
12245 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12246 {
12247 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12248 ptr->x_aarch64_branch_protection_string
12249 = opts->x_aarch64_branch_protection_string;
12250 }
12251
12252 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12253 using the information saved in PTR. */
12254
12255 static void
12256 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12257 {
12258 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12259 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12260 opts->x_explicit_arch = ptr->x_explicit_arch;
12261 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12262 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12263 opts->x_aarch64_branch_protection_string
12264 = ptr->x_aarch64_branch_protection_string;
12265 if (opts->x_aarch64_branch_protection_string)
12266 {
12267 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12268 NULL);
12269 }
12270
12271 aarch64_override_options_internal (opts);
12272 }
12273
12274 /* Implement TARGET_OPTION_PRINT. */
12275
12276 static void
12277 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12278 {
12279 const struct processor *cpu
12280 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12281 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12282 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12283 std::string extension
12284 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12285
12286 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12287 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12288 arch->name, extension.c_str ());
12289 }
12290
12291 static GTY(()) tree aarch64_previous_fndecl;
12292
12293 void
12294 aarch64_reset_previous_fndecl (void)
12295 {
12296 aarch64_previous_fndecl = NULL;
12297 }
12298
12299 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12300 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12301 make sure optab availability predicates are recomputed when necessary. */
12302
12303 void
12304 aarch64_save_restore_target_globals (tree new_tree)
12305 {
12306 if (TREE_TARGET_GLOBALS (new_tree))
12307 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12308 else if (new_tree == target_option_default_node)
12309 restore_target_globals (&default_target_globals);
12310 else
12311 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12312 }
12313
12314 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12315 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12316 of the function, if such exists. This function may be called multiple
12317 times on a single function so use aarch64_previous_fndecl to avoid
12318 setting up identical state. */
12319
12320 static void
12321 aarch64_set_current_function (tree fndecl)
12322 {
12323 if (!fndecl || fndecl == aarch64_previous_fndecl)
12324 return;
12325
12326 tree old_tree = (aarch64_previous_fndecl
12327 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12328 : NULL_TREE);
12329
12330 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12331
12332 /* If current function has no attributes but the previous one did,
12333 use the default node. */
12334 if (!new_tree && old_tree)
12335 new_tree = target_option_default_node;
12336
12337 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12338 the default have been handled by aarch64_save_restore_target_globals from
12339 aarch64_pragma_target_parse. */
12340 if (old_tree == new_tree)
12341 return;
12342
12343 aarch64_previous_fndecl = fndecl;
12344
12345 /* First set the target options. */
12346 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12347
12348 aarch64_save_restore_target_globals (new_tree);
12349 }
12350
12351 /* Enum describing the various ways we can handle attributes.
12352 In many cases we can reuse the generic option handling machinery. */
12353
12354 enum aarch64_attr_opt_type
12355 {
12356 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12357 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12358 aarch64_attr_enum, /* Attribute sets an enum variable. */
12359 aarch64_attr_custom /* Attribute requires a custom handling function. */
12360 };
12361
12362 /* All the information needed to handle a target attribute.
12363 NAME is the name of the attribute.
12364 ATTR_TYPE specifies the type of behavior of the attribute as described
12365 in the definition of enum aarch64_attr_opt_type.
12366 ALLOW_NEG is true if the attribute supports a "no-" form.
12367 HANDLER is the function that takes the attribute string as an argument.
12368 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12369 OPT_NUM is the enum specifying the option that the attribute modifies.
12370 This is needed for attributes that mirror the behavior of a command-line
12371 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12372 aarch64_attr_enum. */
12373
12374 struct aarch64_attribute_info
12375 {
12376 const char *name;
12377 enum aarch64_attr_opt_type attr_type;
12378 bool allow_neg;
12379 bool (*handler) (const char *);
12380 enum opt_code opt_num;
12381 };
12382
12383 /* Handle the ARCH_STR argument to the arch= target attribute. */
12384
12385 static bool
12386 aarch64_handle_attr_arch (const char *str)
12387 {
12388 const struct processor *tmp_arch = NULL;
12389 std::string invalid_extension;
12390 enum aarch64_parse_opt_result parse_res
12391 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12392
12393 if (parse_res == AARCH64_PARSE_OK)
12394 {
12395 gcc_assert (tmp_arch);
12396 selected_arch = tmp_arch;
12397 explicit_arch = selected_arch->arch;
12398 return true;
12399 }
12400
12401 switch (parse_res)
12402 {
12403 case AARCH64_PARSE_MISSING_ARG:
12404 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12405 break;
12406 case AARCH64_PARSE_INVALID_ARG:
12407 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12408 aarch64_print_hint_for_arch (str);
12409 break;
12410 case AARCH64_PARSE_INVALID_FEATURE:
12411 error ("invalid feature modifier %s of value (\"%s\") in "
12412 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12413 aarch64_print_hint_for_extensions (invalid_extension);
12414 break;
12415 default:
12416 gcc_unreachable ();
12417 }
12418
12419 return false;
12420 }
12421
12422 /* Handle the argument CPU_STR to the cpu= target attribute. */
12423
12424 static bool
12425 aarch64_handle_attr_cpu (const char *str)
12426 {
12427 const struct processor *tmp_cpu = NULL;
12428 std::string invalid_extension;
12429 enum aarch64_parse_opt_result parse_res
12430 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12431
12432 if (parse_res == AARCH64_PARSE_OK)
12433 {
12434 gcc_assert (tmp_cpu);
12435 selected_tune = tmp_cpu;
12436 explicit_tune_core = selected_tune->ident;
12437
12438 selected_arch = &all_architectures[tmp_cpu->arch];
12439 explicit_arch = selected_arch->arch;
12440 return true;
12441 }
12442
12443 switch (parse_res)
12444 {
12445 case AARCH64_PARSE_MISSING_ARG:
12446 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12447 break;
12448 case AARCH64_PARSE_INVALID_ARG:
12449 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12450 aarch64_print_hint_for_core (str);
12451 break;
12452 case AARCH64_PARSE_INVALID_FEATURE:
12453 error ("invalid feature modifier %s of value (\"%s\") in "
12454 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12455 aarch64_print_hint_for_extensions (invalid_extension);
12456 break;
12457 default:
12458 gcc_unreachable ();
12459 }
12460
12461 return false;
12462 }
12463
12464 /* Handle the argument STR to the branch-protection= attribute. */
12465
12466 static bool
12467 aarch64_handle_attr_branch_protection (const char* str)
12468 {
12469 char *err_str = (char *) xmalloc (strlen (str) + 1);
12470 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12471 &err_str);
12472 bool success = false;
12473 switch (res)
12474 {
12475 case AARCH64_PARSE_MISSING_ARG:
12476 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12477 " attribute");
12478 break;
12479 case AARCH64_PARSE_INVALID_ARG:
12480 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12481 "=\")%> pragma or attribute", err_str);
12482 break;
12483 case AARCH64_PARSE_OK:
12484 success = true;
12485 /* Fall through. */
12486 case AARCH64_PARSE_INVALID_FEATURE:
12487 break;
12488 default:
12489 gcc_unreachable ();
12490 }
12491 free (err_str);
12492 return success;
12493 }
12494
12495 /* Handle the argument STR to the tune= target attribute. */
12496
12497 static bool
12498 aarch64_handle_attr_tune (const char *str)
12499 {
12500 const struct processor *tmp_tune = NULL;
12501 enum aarch64_parse_opt_result parse_res
12502 = aarch64_parse_tune (str, &tmp_tune);
12503
12504 if (parse_res == AARCH64_PARSE_OK)
12505 {
12506 gcc_assert (tmp_tune);
12507 selected_tune = tmp_tune;
12508 explicit_tune_core = selected_tune->ident;
12509 return true;
12510 }
12511
12512 switch (parse_res)
12513 {
12514 case AARCH64_PARSE_INVALID_ARG:
12515 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12516 aarch64_print_hint_for_core (str);
12517 break;
12518 default:
12519 gcc_unreachable ();
12520 }
12521
12522 return false;
12523 }
12524
12525 /* Parse an architecture extensions target attribute string specified in STR.
12526 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12527 if successful. Update aarch64_isa_flags to reflect the ISA features
12528 modified. */
12529
12530 static bool
12531 aarch64_handle_attr_isa_flags (char *str)
12532 {
12533 enum aarch64_parse_opt_result parse_res;
12534 uint64_t isa_flags = aarch64_isa_flags;
12535
12536 /* We allow "+nothing" at the beginning to clear out all architectural
12537 features if the user wants to handpick specific features. */
12538 if (strncmp ("+nothing", str, 8) == 0)
12539 {
12540 isa_flags = 0;
12541 str += 8;
12542 }
12543
12544 std::string invalid_extension;
12545 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12546
12547 if (parse_res == AARCH64_PARSE_OK)
12548 {
12549 aarch64_isa_flags = isa_flags;
12550 return true;
12551 }
12552
12553 switch (parse_res)
12554 {
12555 case AARCH64_PARSE_MISSING_ARG:
12556 error ("missing value in %<target()%> pragma or attribute");
12557 break;
12558
12559 case AARCH64_PARSE_INVALID_FEATURE:
12560 error ("invalid feature modifier %s of value (\"%s\") in "
12561 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12562 break;
12563
12564 default:
12565 gcc_unreachable ();
12566 }
12567
12568 return false;
12569 }
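/* Illustrative usage (a sketch, function names invented): a bare extension
   string is routed here by aarch64_process_one_target_attr, e.g.

     __attribute__ ((target ("+crc+nosimd")))   // adjust the current ISA flags
     unsigned g (unsigned x);

     __attribute__ ((target ("+nothing+fp")))   // start from no features,
     double h (double x);                       // then enable FP only  */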
12570
12571 /* The target attributes that we support. On top of these we also support just
12572 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12573 handled explicitly in aarch64_process_one_target_attr. */
12574
12575 static const struct aarch64_attribute_info aarch64_attributes[] =
12576 {
12577 { "general-regs-only", aarch64_attr_mask, false, NULL,
12578 OPT_mgeneral_regs_only },
12579 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12580 OPT_mfix_cortex_a53_835769 },
12581 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12582 OPT_mfix_cortex_a53_843419 },
12583 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12584 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12585 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12586 OPT_momit_leaf_frame_pointer },
12587 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12588 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12589 OPT_march_ },
12590 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12591 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12592 OPT_mtune_ },
12593 { "branch-protection", aarch64_attr_custom, false,
12594 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12595 { "sign-return-address", aarch64_attr_enum, false, NULL,
12596 OPT_msign_return_address_ },
12597 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12598 };
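/* Illustrative mapping (a sketch) from source-level strings to the table
   rows above:

     target ("general-regs-only")         -> aarch64_attr_mask, no argument
     target ("no-fix-cortex-a53-835769")  -> aarch64_attr_bool, negated form
     target ("cmodel=small")              -> aarch64_attr_enum, needs an argument
     target ("arch=armv8.1-a")            -> aarch64_attr_custom, calls
                                             aarch64_handle_attr_arch  */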
12599
12600 /* Parse ARG_STR which contains the definition of one target attribute.
12601 Show appropriate errors if any or return true if the attribute is valid. */
12602
12603 static bool
12604 aarch64_process_one_target_attr (char *arg_str)
12605 {
12606 bool invert = false;
12607
12608 size_t len = strlen (arg_str);
12609
12610 if (len == 0)
12611 {
12612 error ("malformed %<target()%> pragma or attribute");
12613 return false;
12614 }
12615
12616 char *str_to_check = (char *) alloca (len + 1);
12617 strcpy (str_to_check, arg_str);
12618
12619 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12620 It is easier to detect and handle it explicitly here rather than going
12621 through the machinery for the rest of the target attributes in this
12622 function. */
12623 if (*str_to_check == '+')
12624 return aarch64_handle_attr_isa_flags (str_to_check);
12625
12626 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12627 {
12628 invert = true;
12629 str_to_check += 3;
12630 }
12631 char *arg = strchr (str_to_check, '=');
12632
12633 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12634 and point ARG to "foo". */
12635 if (arg)
12636 {
12637 *arg = '\0';
12638 arg++;
12639 }
12640 const struct aarch64_attribute_info *p_attr;
12641 bool found = false;
12642 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12643 {
12644 /* If the names don't match up, or the user has given an argument
12645 to an attribute that doesn't accept one, or didn't give an argument
12646 to an attribute that expects one, fail to match. */
12647 if (strcmp (str_to_check, p_attr->name) != 0)
12648 continue;
12649
12650 found = true;
12651 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12652 || p_attr->attr_type == aarch64_attr_enum;
12653
12654 if (attr_need_arg_p ^ (arg != NULL))
12655 {
12656 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12657 return false;
12658 }
12659
12660 /* If the name matches but the attribute does not allow "no-" versions
12661 then we can't match. */
12662 if (invert && !p_attr->allow_neg)
12663 {
12664 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12665 return false;
12666 }
12667
12668 switch (p_attr->attr_type)
12669 {
12670 /* Has a custom handler registered.
12671 For example, cpu=, arch=, tune=. */
12672 case aarch64_attr_custom:
12673 gcc_assert (p_attr->handler);
12674 if (!p_attr->handler (arg))
12675 return false;
12676 break;
12677
12678 /* Either set or unset a boolean option. */
12679 case aarch64_attr_bool:
12680 {
12681 struct cl_decoded_option decoded;
12682
12683 generate_option (p_attr->opt_num, NULL, !invert,
12684 CL_TARGET, &decoded);
12685 aarch64_handle_option (&global_options, &global_options_set,
12686 &decoded, input_location);
12687 break;
12688 }
12689 /* Set or unset a bit in the target_flags. aarch64_handle_option
12690 should know what mask to apply given the option number. */
12691 case aarch64_attr_mask:
12692 {
12693 struct cl_decoded_option decoded;
12694 /* We only need to specify the option number.
12695 aarch64_handle_option will know which mask to apply. */
12696 decoded.opt_index = p_attr->opt_num;
12697 decoded.value = !invert;
12698 aarch64_handle_option (&global_options, &global_options_set,
12699 &decoded, input_location);
12700 break;
12701 }
12702 /* Use the option setting machinery to set an option to an enum. */
12703 case aarch64_attr_enum:
12704 {
12705 gcc_assert (arg);
12706 bool valid;
12707 int value;
12708 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12709 &value, CL_TARGET);
12710 if (valid)
12711 {
12712 set_option (&global_options, NULL, p_attr->opt_num, value,
12713 NULL, DK_UNSPECIFIED, input_location,
12714 global_dc);
12715 }
12716 else
12717 {
12718 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12719 }
12720 break;
12721 }
12722 default:
12723 gcc_unreachable ();
12724 }
12725 }
12726
12727 /* If we reached this point, we either found an attribute and validated
12728 it, or didn't match any. If we matched an attribute but its arguments
12729 were malformed, we will have already returned false. */
12730 return found;
12731 }
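/* Worked example (a sketch) of the parsing above:

     "no-omit-leaf-frame-pointer"  -> invert = true, matches the
                                      "omit-leaf-frame-pointer" row
                                      (aarch64_attr_bool, allow_neg)
     "tune=cortex-a72"             -> str_to_check = "tune", arg = "cortex-a72",
                                      dispatched to aarch64_handle_attr_tune
     "+crc"                        -> handled early by
                                      aarch64_handle_attr_isa_flags  */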
12732
12733 /* Count how many times the character C appears in
12734 NULL-terminated string STR. */
12735
12736 static unsigned int
12737 num_occurences_in_str (char c, char *str)
12738 {
12739 unsigned int res = 0;
12740 while (*str != '\0')
12741 {
12742 if (*str == c)
12743 res++;
12744
12745 str++;
12746 }
12747
12748 return res;
12749 }
12750
12751 /* Parse the tree in ARGS that contains the target attribute information
12752 and update the global target options space. */
12753
12754 bool
12755 aarch64_process_target_attr (tree args)
12756 {
12757 if (TREE_CODE (args) == TREE_LIST)
12758 {
12759 do
12760 {
12761 tree head = TREE_VALUE (args);
12762 if (head)
12763 {
12764 if (!aarch64_process_target_attr (head))
12765 return false;
12766 }
12767 args = TREE_CHAIN (args);
12768 } while (args);
12769
12770 return true;
12771 }
12772
12773 if (TREE_CODE (args) != STRING_CST)
12774 {
12775 error ("attribute %<target%> argument not a string");
12776 return false;
12777 }
12778
12779 size_t len = strlen (TREE_STRING_POINTER (args));
12780 char *str_to_check = (char *) alloca (len + 1);
12781 strcpy (str_to_check, TREE_STRING_POINTER (args));
12782
12783 if (len == 0)
12784 {
12785 error ("malformed %<target()%> pragma or attribute");
12786 return false;
12787 }
12788
12789 /* Used to catch empty tokens between commas, e.g.
12790 attribute ((target ("attr1,,attr2"))). */
12791 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12792
12793 /* Handle multiple target attributes separated by ','. */
12794 char *token = strtok_r (str_to_check, ",", &str_to_check);
12795
12796 unsigned int num_attrs = 0;
12797 while (token)
12798 {
12799 num_attrs++;
12800 if (!aarch64_process_one_target_attr (token))
12801 {
12802 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12803 return false;
12804 }
12805
12806 token = strtok_r (NULL, ",", &str_to_check);
12807 }
12808
12809 if (num_attrs != num_commas + 1)
12810 {
12811 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12812 return false;
12813 }
12814
12815 return true;
12816 }
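/* Illustrative input (a sketch): a comma-separated attribute such as

     __attribute__ ((target ("arch=armv8.2-a+crypto,strict-align,tune=cortex-a72")))

   is split into three tokens, each passed to
   aarch64_process_one_target_attr.  A string with an empty token, e.g.
   "attr1,,attr2", yields two tokens but two commas, so the
   num_attrs != num_commas + 1 check above rejects it as malformed.  */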
12817
12818 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12819 process attribute ((target ("..."))). */
12820
12821 static bool
12822 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12823 {
12824 struct cl_target_option cur_target;
12825 bool ret;
12826 tree old_optimize;
12827 tree new_target, new_optimize;
12828 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12829
12830 /* If what we're processing is the current pragma string then the
12831 target option node is already stored in target_option_current_node
12832 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12833 having to re-parse the string. This is especially useful to keep
12834 arm_neon.h compile times down since that header contains a lot
12835 of intrinsics enclosed in pragmas. */
12836 if (!existing_target && args == current_target_pragma)
12837 {
12838 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12839 return true;
12840 }
12841 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12842
12843 old_optimize = build_optimization_node (&global_options);
12844 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12845
12846 /* If the function changed the optimization levels as well as setting
12847 target options, start with the optimizations specified. */
12848 if (func_optimize && func_optimize != old_optimize)
12849 cl_optimization_restore (&global_options,
12850 TREE_OPTIMIZATION (func_optimize));
12851
12852 /* Save the current target options to restore at the end. */
12853 cl_target_option_save (&cur_target, &global_options);
12854
12855 /* If fndecl already has some target attributes applied to it, unpack
12856 them so that we add this attribute on top of them, rather than
12857 overwriting them. */
12858 if (existing_target)
12859 {
12860 struct cl_target_option *existing_options
12861 = TREE_TARGET_OPTION (existing_target);
12862
12863 if (existing_options)
12864 cl_target_option_restore (&global_options, existing_options);
12865 }
12866 else
12867 cl_target_option_restore (&global_options,
12868 TREE_TARGET_OPTION (target_option_current_node));
12869
12870 ret = aarch64_process_target_attr (args);
12871
12872 /* Set up any additional state. */
12873 if (ret)
12874 {
12875 aarch64_override_options_internal (&global_options);
12876 /* Initialize SIMD builtins if we haven't already.
12877 Set current_target_pragma to NULL for the duration so that
12878 the builtin initialization code doesn't try to tag the functions
12879 being built with the attributes specified by any current pragma, thus
12880 going into an infinite recursion. */
12881 if (TARGET_SIMD)
12882 {
12883 tree saved_current_target_pragma = current_target_pragma;
12884 current_target_pragma = NULL;
12885 aarch64_init_simd_builtins ();
12886 current_target_pragma = saved_current_target_pragma;
12887 }
12888 new_target = build_target_option_node (&global_options);
12889 }
12890 else
12891 new_target = NULL;
12892
12893 new_optimize = build_optimization_node (&global_options);
12894
12895 if (fndecl && ret)
12896 {
12897 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12898
12899 if (old_optimize != new_optimize)
12900 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12901 }
12902
12903 cl_target_option_restore (&global_options, &cur_target);
12904
12905 if (old_optimize != new_optimize)
12906 cl_optimization_restore (&global_options,
12907 TREE_OPTIMIZATION (old_optimize));
12908 return ret;
12909 }
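/* Illustrative fast path (a sketch): a header that wraps many intrinsic
   declarations in a pragma, roughly the way arm_neon.h does, e.g.

     #pragma GCC push_options
     #pragma GCC target ("+nothing+simd")
     ... intrinsic declarations ...
     #pragma GCC pop_options

   lets every declaration reuse target_option_current_node instead of
   re-parsing the string.  The exact pragma string used by arm_neon.h may
   differ between releases.  */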
12910
12911 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12912 tri-bool options (yes, no, don't care) and the default value is
12913 DEF, determine whether to reject inlining. */
12914
12915 static bool
12916 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12917 int dont_care, int def)
12918 {
12919 /* If the callee doesn't care, always allow inlining. */
12920 if (callee == dont_care)
12921 return true;
12922
12923 /* If the caller doesn't care, always allow inlining. */
12924 if (caller == dont_care)
12925 return true;
12926
12927 /* Otherwise, allow inlining if either the callee and caller values
12928 agree, or if the callee is using the default value. */
12929 return (callee == caller || callee == def);
12930 }
12931
12932 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12933 to inline CALLEE into CALLER based on target-specific info.
12934 Make sure that the caller and callee have compatible architectural
12935 features. Then go through the other possible target attributes
12936 and see if they can block inlining. Try not to reject always_inline
12937 callees unless they are incompatible architecturally. */
12938
12939 static bool
12940 aarch64_can_inline_p (tree caller, tree callee)
12941 {
12942 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12943 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12944
12945 struct cl_target_option *caller_opts
12946 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12947 : target_option_default_node);
12948
12949 struct cl_target_option *callee_opts
12950 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12951 : target_option_default_node);
12952
12953 /* Callee's ISA flags should be a subset of the caller's. */
12954 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12955 != callee_opts->x_aarch64_isa_flags)
12956 return false;
12957
12958 /* Allow non-strict aligned functions inlining into strict
12959 aligned ones. */
12960 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12961 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12962 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12963 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12964 return false;
12965
12966 bool always_inline = lookup_attribute ("always_inline",
12967 DECL_ATTRIBUTES (callee));
12968
12969 /* If the architectural features match up and the callee is always_inline
12970 then the other attributes don't matter. */
12971 if (always_inline)
12972 return true;
12973
12974 if (caller_opts->x_aarch64_cmodel_var
12975 != callee_opts->x_aarch64_cmodel_var)
12976 return false;
12977
12978 if (caller_opts->x_aarch64_tls_dialect
12979 != callee_opts->x_aarch64_tls_dialect)
12980 return false;
12981
12982 /* Honour explicit requests to workaround errata. */
12983 if (!aarch64_tribools_ok_for_inlining_p (
12984 caller_opts->x_aarch64_fix_a53_err835769,
12985 callee_opts->x_aarch64_fix_a53_err835769,
12986 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12987 return false;
12988
12989 if (!aarch64_tribools_ok_for_inlining_p (
12990 caller_opts->x_aarch64_fix_a53_err843419,
12991 callee_opts->x_aarch64_fix_a53_err843419,
12992 2, TARGET_FIX_ERR_A53_843419))
12993 return false;
12994
12995 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12996 caller and callee and they don't match up, reject inlining. */
12997 if (!aarch64_tribools_ok_for_inlining_p (
12998 caller_opts->x_flag_omit_leaf_frame_pointer,
12999 callee_opts->x_flag_omit_leaf_frame_pointer,
13000 2, 1))
13001 return false;
13002
13003 /* If the callee has specific tuning overrides, respect them. */
13004 if (callee_opts->x_aarch64_override_tune_string != NULL
13005 && caller_opts->x_aarch64_override_tune_string == NULL)
13006 return false;
13007
13008 /* If the user specified tuning override strings for the
13009 caller and callee and they don't match up, reject inlining.
13010 We just do a string compare here, we don't analyze the meaning
13011 of the string, as it would be too costly for little gain. */
13012 if (callee_opts->x_aarch64_override_tune_string
13013 && caller_opts->x_aarch64_override_tune_string
13014 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13015 caller_opts->x_aarch64_override_tune_string) != 0))
13016 return false;
13017
13018 return true;
13019 }
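/* Illustrative outcomes (a sketch, hypothetical functions):

     static inline int __attribute__ ((target ("+crc"), always_inline))
     use_crc (int x) { ... }

     int plain (int x) { return use_crc (x); }   // plain -march=armv8-a

   is rejected: the callee's ISA flags (CRC) are not a subset of the
   caller's, and that check is made even for always_inline.  Conversely,
   a callee without strict-align may be inlined into a caller built with
   -mstrict-align, but not the other way around.  */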
13020
13021 /* Return true if SYMBOL_REF X binds locally. */
13022
13023 static bool
13024 aarch64_symbol_binds_local_p (const_rtx x)
13025 {
13026 return (SYMBOL_REF_DECL (x)
13027 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13028 : SYMBOL_REF_LOCAL_P (x));
13029 }
13030
13031 /* Return true if SYMBOL_REF X is thread local */
13032 static bool
13033 aarch64_tls_symbol_p (rtx x)
13034 {
13035 if (! TARGET_HAVE_TLS)
13036 return false;
13037
13038 if (GET_CODE (x) != SYMBOL_REF)
13039 return false;
13040
13041 return SYMBOL_REF_TLS_MODEL (x) != 0;
13042 }
13043
13044 /* Classify a TLS symbol into one of the TLS kinds. */
13045 enum aarch64_symbol_type
13046 aarch64_classify_tls_symbol (rtx x)
13047 {
13048 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13049
13050 switch (tls_kind)
13051 {
13052 case TLS_MODEL_GLOBAL_DYNAMIC:
13053 case TLS_MODEL_LOCAL_DYNAMIC:
13054 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13055
13056 case TLS_MODEL_INITIAL_EXEC:
13057 switch (aarch64_cmodel)
13058 {
13059 case AARCH64_CMODEL_TINY:
13060 case AARCH64_CMODEL_TINY_PIC:
13061 return SYMBOL_TINY_TLSIE;
13062 default:
13063 return SYMBOL_SMALL_TLSIE;
13064 }
13065
13066 case TLS_MODEL_LOCAL_EXEC:
13067 if (aarch64_tls_size == 12)
13068 return SYMBOL_TLSLE12;
13069 else if (aarch64_tls_size == 24)
13070 return SYMBOL_TLSLE24;
13071 else if (aarch64_tls_size == 32)
13072 return SYMBOL_TLSLE32;
13073 else if (aarch64_tls_size == 48)
13074 return SYMBOL_TLSLE48;
13075 else
13076 gcc_unreachable ();
13077
13078 case TLS_MODEL_EMULATED:
13079 case TLS_MODEL_NONE:
13080 return SYMBOL_FORCE_TO_MEM;
13081
13082 default:
13083 gcc_unreachable ();
13084 }
13085 }
13086
13087 /* Return the correct method for accessing X + OFFSET, where X is either
13088 a SYMBOL_REF or LABEL_REF. */
13089
13090 enum aarch64_symbol_type
13091 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13092 {
13093 if (GET_CODE (x) == LABEL_REF)
13094 {
13095 switch (aarch64_cmodel)
13096 {
13097 case AARCH64_CMODEL_LARGE:
13098 return SYMBOL_FORCE_TO_MEM;
13099
13100 case AARCH64_CMODEL_TINY_PIC:
13101 case AARCH64_CMODEL_TINY:
13102 return SYMBOL_TINY_ABSOLUTE;
13103
13104 case AARCH64_CMODEL_SMALL_SPIC:
13105 case AARCH64_CMODEL_SMALL_PIC:
13106 case AARCH64_CMODEL_SMALL:
13107 return SYMBOL_SMALL_ABSOLUTE;
13108
13109 default:
13110 gcc_unreachable ();
13111 }
13112 }
13113
13114 if (GET_CODE (x) == SYMBOL_REF)
13115 {
13116 if (aarch64_tls_symbol_p (x))
13117 return aarch64_classify_tls_symbol (x);
13118
13119 switch (aarch64_cmodel)
13120 {
13121 case AARCH64_CMODEL_TINY:
13122 /* When we retrieve symbol + offset address, we have to make sure
13123 the offset does not cause overflow of the final address. But
13124 we have no way of knowing the address of the symbol at compile time,
13125 so we can't accurately say if the distance between the PC and
13126 symbol + offset is outside the addressable range of +/-1M in the
13127 TINY code model. So we rely on images not being greater than
13128 1M, cap the offset at 1M, and anything beyond 1M will have to
13129 be loaded using an alternative mechanism. Furthermore, if the
13130 symbol is a weak reference to something that isn't known to
13131 resolve to a symbol in this module, then force it to memory. */
13132 if ((SYMBOL_REF_WEAK (x)
13133 && !aarch64_symbol_binds_local_p (x))
13134 || !IN_RANGE (offset, -1048575, 1048575))
13135 return SYMBOL_FORCE_TO_MEM;
13136 return SYMBOL_TINY_ABSOLUTE;
13137
13138 case AARCH64_CMODEL_SMALL:
13139 /* Same reasoning as the tiny code model, but the offset cap here is
13140 4G. */
13141 if ((SYMBOL_REF_WEAK (x)
13142 && !aarch64_symbol_binds_local_p (x))
13143 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13144 HOST_WIDE_INT_C (4294967264)))
13145 return SYMBOL_FORCE_TO_MEM;
13146 return SYMBOL_SMALL_ABSOLUTE;
13147
13148 case AARCH64_CMODEL_TINY_PIC:
13149 if (!aarch64_symbol_binds_local_p (x))
13150 return SYMBOL_TINY_GOT;
13151 return SYMBOL_TINY_ABSOLUTE;
13152
13153 case AARCH64_CMODEL_SMALL_SPIC:
13154 case AARCH64_CMODEL_SMALL_PIC:
13155 if (!aarch64_symbol_binds_local_p (x))
13156 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13157 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13158 return SYMBOL_SMALL_ABSOLUTE;
13159
13160 case AARCH64_CMODEL_LARGE:
13161 /* This is alright even in PIC code as the constant
13162 pool reference is always PC relative and within
13163 the same translation unit. */
13164 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13165 return SYMBOL_SMALL_ABSOLUTE;
13166 else
13167 return SYMBOL_FORCE_TO_MEM;
13168
13169 default:
13170 gcc_unreachable ();
13171 }
13172 }
13173
13174 /* By default push everything into the constant pool. */
13175 return SYMBOL_FORCE_TO_MEM;
13176 }
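/* Illustrative classifications (a sketch) for the default small code model:

     extern int x;  ...  &x         -> SYMBOL_SMALL_ABSOLUTE  (adrp/add)
     the same with -fPIC            -> SYMBOL_SMALL_GOT_4G    (adrp + GOT load)
     extern int w __attribute__ ((weak));  ...  &w
                                    -> SYMBOL_FORCE_TO_MEM if w may be
                                       undefined (does not bind locally)
     &x + 0x200000000               -> SYMBOL_FORCE_TO_MEM, offset outside
                                       the +/-4G window checked above  */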
13177
13178 bool
13179 aarch64_constant_address_p (rtx x)
13180 {
13181 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13182 }
13183
13184 bool
13185 aarch64_legitimate_pic_operand_p (rtx x)
13186 {
13187 if (GET_CODE (x) == SYMBOL_REF
13188 || (GET_CODE (x) == CONST
13189 && GET_CODE (XEXP (x, 0)) == PLUS
13190 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13191 return false;
13192
13193 return true;
13194 }
13195
13196 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13197 that should be rematerialized rather than spilled. */
13198
13199 static bool
13200 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13201 {
13202 /* Support CSE and rematerialization of common constants. */
13203 if (CONST_INT_P (x)
13204 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13205 || GET_CODE (x) == CONST_VECTOR)
13206 return true;
13207
13208 /* Do not allow vector struct mode constants for Advanced SIMD.
13209 We could support 0 and -1 easily, but they need support in
13210 aarch64-simd.md. */
13211 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13212 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13213 return false;
13214
13215 /* Only accept variable-length vector constants if they can be
13216 handled directly.
13217
13218 ??? It would be possible to handle rematerialization of other
13219 constants via secondary reloads. */
13220 if (vec_flags & VEC_ANY_SVE)
13221 return aarch64_simd_valid_immediate (x, NULL);
13222
13223 if (GET_CODE (x) == HIGH)
13224 x = XEXP (x, 0);
13225
13226 /* Accept polynomial constants that can be calculated by using the
13227 destination of a move as the sole temporary. Constants that
13228 require a second temporary cannot be rematerialized (they can't be
13229 forced to memory and also aren't legitimate constants). */
13230 poly_int64 offset;
13231 if (poly_int_rtx_p (x, &offset))
13232 return aarch64_offset_temporaries (false, offset) <= 1;
13233
13234 /* If an offset is being added to something else, we need to allow the
13235 base to be moved into the destination register, meaning that there
13236 are no free temporaries for the offset. */
13237 x = strip_offset (x, &offset);
13238 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13239 return false;
13240
13241 /* Do not allow const (plus (anchor_symbol, const_int)). */
13242 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13243 return false;
13244
13245 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13246 so spilling them is better than rematerialization. */
13247 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13248 return true;
13249
13250 /* Label references are always constant. */
13251 if (GET_CODE (x) == LABEL_REF)
13252 return true;
13253
13254 return false;
13255 }
13256
13257 rtx
13258 aarch64_load_tp (rtx target)
13259 {
13260 if (!target
13261 || GET_MODE (target) != Pmode
13262 || !register_operand (target, Pmode))
13263 target = gen_reg_rtx (Pmode);
13264
13265 /* Can return in any reg. */
13266 emit_insn (gen_aarch64_load_tp_hard (target));
13267 return target;
13268 }
13269
13270 /* On AAPCS systems, this is the "struct __va_list". */
13271 static GTY(()) tree va_list_type;
13272
13273 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13274 Return the type to use as __builtin_va_list.
13275
13276 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13277
13278 struct __va_list
13279 {
13280 void *__stack;
13281 void *__gr_top;
13282 void *__vr_top;
13283 int __gr_offs;
13284 int __vr_offs;
13285 }; */
13286
13287 static tree
13288 aarch64_build_builtin_va_list (void)
13289 {
13290 tree va_list_name;
13291 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13292
13293 /* Create the type. */
13294 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13295 /* Give it the required name. */
13296 va_list_name = build_decl (BUILTINS_LOCATION,
13297 TYPE_DECL,
13298 get_identifier ("__va_list"),
13299 va_list_type);
13300 DECL_ARTIFICIAL (va_list_name) = 1;
13301 TYPE_NAME (va_list_type) = va_list_name;
13302 TYPE_STUB_DECL (va_list_type) = va_list_name;
13303
13304 /* Create the fields. */
13305 f_stack = build_decl (BUILTINS_LOCATION,
13306 FIELD_DECL, get_identifier ("__stack"),
13307 ptr_type_node);
13308 f_grtop = build_decl (BUILTINS_LOCATION,
13309 FIELD_DECL, get_identifier ("__gr_top"),
13310 ptr_type_node);
13311 f_vrtop = build_decl (BUILTINS_LOCATION,
13312 FIELD_DECL, get_identifier ("__vr_top"),
13313 ptr_type_node);
13314 f_groff = build_decl (BUILTINS_LOCATION,
13315 FIELD_DECL, get_identifier ("__gr_offs"),
13316 integer_type_node);
13317 f_vroff = build_decl (BUILTINS_LOCATION,
13318 FIELD_DECL, get_identifier ("__vr_offs"),
13319 integer_type_node);
13320
13321 /* Tell the tree-stdarg pass about our internal offset fields.
13322 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13323 purposes, to identify whether the code is updating the va_list internal
13324 offset fields in an irregular way. */
13325 va_list_gpr_counter_field = f_groff;
13326 va_list_fpr_counter_field = f_vroff;
13327
13328 DECL_ARTIFICIAL (f_stack) = 1;
13329 DECL_ARTIFICIAL (f_grtop) = 1;
13330 DECL_ARTIFICIAL (f_vrtop) = 1;
13331 DECL_ARTIFICIAL (f_groff) = 1;
13332 DECL_ARTIFICIAL (f_vroff) = 1;
13333
13334 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13335 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13336 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13337 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13338 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13339
13340 TYPE_FIELDS (va_list_type) = f_stack;
13341 DECL_CHAIN (f_stack) = f_grtop;
13342 DECL_CHAIN (f_grtop) = f_vrtop;
13343 DECL_CHAIN (f_vrtop) = f_groff;
13344 DECL_CHAIN (f_groff) = f_vroff;
13345
13346 /* Compute its layout. */
13347 layout_type (va_list_type);
13348
13349 return va_list_type;
13350 }
13351
13352 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13353 static void
13354 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13355 {
13356 const CUMULATIVE_ARGS *cum;
13357 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13358 tree stack, grtop, vrtop, groff, vroff;
13359 tree t;
13360 int gr_save_area_size = cfun->va_list_gpr_size;
13361 int vr_save_area_size = cfun->va_list_fpr_size;
13362 int vr_offset;
13363
13364 cum = &crtl->args.info;
13365 if (cfun->va_list_gpr_size)
13366 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13367 cfun->va_list_gpr_size);
13368 if (cfun->va_list_fpr_size)
13369 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13370 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13371
13372 if (!TARGET_FLOAT)
13373 {
13374 gcc_assert (cum->aapcs_nvrn == 0);
13375 vr_save_area_size = 0;
13376 }
13377
13378 f_stack = TYPE_FIELDS (va_list_type_node);
13379 f_grtop = DECL_CHAIN (f_stack);
13380 f_vrtop = DECL_CHAIN (f_grtop);
13381 f_groff = DECL_CHAIN (f_vrtop);
13382 f_vroff = DECL_CHAIN (f_groff);
13383
13384 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13385 NULL_TREE);
13386 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13387 NULL_TREE);
13388 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13389 NULL_TREE);
13390 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13391 NULL_TREE);
13392 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13393 NULL_TREE);
13394
13395 /* Emit code to initialize STACK, which points to the next varargs stack
13396 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13397 by named arguments. STACK is 8-byte aligned. */
13398 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13399 if (cum->aapcs_stack_size > 0)
13400 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13401 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13402 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13403
13404 /* Emit code to initialize GRTOP, the top of the GR save area.
13405 virtual_incoming_args_rtx should have been 16-byte aligned. */
13406 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13407 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13408 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13409
13410 /* Emit code to initialize VRTOP, the top of the VR save area.
13411 This address is gr_save_area_bytes below GRTOP, rounded
13412 down to the next 16-byte boundary. */
13413 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13414 vr_offset = ROUND_UP (gr_save_area_size,
13415 STACK_BOUNDARY / BITS_PER_UNIT);
13416
13417 if (vr_offset)
13418 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13419 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13420 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13421
13422 /* Emit code to initialize GROFF, the offset from GRTOP of the
13423 next GPR argument. */
13424 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13425 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13426 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13427
13428 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13429 of the next VR argument. */
13430 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13431 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13432 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13433 }
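/* Worked example (a sketch): for

     void f (int n, ...);

   with all save areas kept (no tree-stdarg shrinking), va_start leaves

     __gr_offs = -56     // 7 unused GP argument registers * 8 bytes
     __vr_offs = -128    // 8 unused FP/SIMD argument registers * 16 bytes
     __gr_top  = just past the GP save area
     __vr_top  = just past the FP/SIMD save area
     __stack   = address of the first anonymous argument passed on the stack  */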
13434
13435 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13436
13437 static tree
13438 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13439 gimple_seq *post_p ATTRIBUTE_UNUSED)
13440 {
13441 tree addr;
13442 bool indirect_p;
13443 bool is_ha; /* is HFA or HVA. */
13444 bool dw_align; /* double-word align. */
13445 machine_mode ag_mode = VOIDmode;
13446 int nregs;
13447 machine_mode mode;
13448
13449 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13450 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13451 HOST_WIDE_INT size, rsize, adjust, align;
13452 tree t, u, cond1, cond2;
13453
13454 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13455 if (indirect_p)
13456 type = build_pointer_type (type);
13457
13458 mode = TYPE_MODE (type);
13459
13460 f_stack = TYPE_FIELDS (va_list_type_node);
13461 f_grtop = DECL_CHAIN (f_stack);
13462 f_vrtop = DECL_CHAIN (f_grtop);
13463 f_groff = DECL_CHAIN (f_vrtop);
13464 f_vroff = DECL_CHAIN (f_groff);
13465
13466 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13467 f_stack, NULL_TREE);
13468 size = int_size_in_bytes (type);
13469
13470 bool abi_break;
13471 align
13472 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13473
13474 dw_align = false;
13475 adjust = 0;
13476 if (aarch64_vfp_is_call_or_return_candidate (mode,
13477 type,
13478 &ag_mode,
13479 &nregs,
13480 &is_ha))
13481 {
13482 /* No frontends can create types with variable-sized modes, so we
13483 shouldn't be asked to pass or return them. */
13484 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13485
13486 /* TYPE passed in fp/simd registers. */
13487 if (!TARGET_FLOAT)
13488 aarch64_err_no_fpadvsimd (mode);
13489
13490 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13491 unshare_expr (valist), f_vrtop, NULL_TREE);
13492 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13493 unshare_expr (valist), f_vroff, NULL_TREE);
13494
13495 rsize = nregs * UNITS_PER_VREG;
13496
13497 if (is_ha)
13498 {
13499 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13500 adjust = UNITS_PER_VREG - ag_size;
13501 }
13502 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13503 && size < UNITS_PER_VREG)
13504 {
13505 adjust = UNITS_PER_VREG - size;
13506 }
13507 }
13508 else
13509 {
13510 /* TYPE passed in general registers. */
13511 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13512 unshare_expr (valist), f_grtop, NULL_TREE);
13513 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13514 unshare_expr (valist), f_groff, NULL_TREE);
13515 rsize = ROUND_UP (size, UNITS_PER_WORD);
13516 nregs = rsize / UNITS_PER_WORD;
13517
13518 if (align > 8)
13519 {
13520 if (abi_break && warn_psabi)
13521 inform (input_location, "parameter passing for argument of type "
13522 "%qT changed in GCC 9.1", type);
13523 dw_align = true;
13524 }
13525
13526 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13527 && size < UNITS_PER_WORD)
13528 {
13529 adjust = UNITS_PER_WORD - size;
13530 }
13531 }
13532
13533 /* Get a local temporary for the field value. */
13534 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13535
13536 /* Emit code to branch if off >= 0. */
13537 t = build2 (GE_EXPR, boolean_type_node, off,
13538 build_int_cst (TREE_TYPE (off), 0));
13539 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13540
13541 if (dw_align)
13542 {
13543 /* Emit: offs = (offs + 15) & -16. */
13544 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13545 build_int_cst (TREE_TYPE (off), 15));
13546 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13547 build_int_cst (TREE_TYPE (off), -16));
13548 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13549 }
13550 else
13551 roundup = NULL;
13552
13553 /* Update ap.__[g|v]r_offs */
13554 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13555 build_int_cst (TREE_TYPE (off), rsize));
13556 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13557
13558 /* String up. */
13559 if (roundup)
13560 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13561
13562 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13563 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13564 build_int_cst (TREE_TYPE (f_off), 0));
13565 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13566
13567 /* String up: make sure the assignment happens before the use. */
13568 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13569 COND_EXPR_ELSE (cond1) = t;
13570
13571 /* Prepare the trees handling the argument that is passed on the stack;
13572 the top level node will be stored in ON_STACK. */
13573 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13574 if (align > 8)
13575 {
13576 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13577 t = fold_build_pointer_plus_hwi (arg, 15);
13578 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13579 build_int_cst (TREE_TYPE (t), -16));
13580 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13581 }
13582 else
13583 roundup = NULL;
13584 /* Advance ap.__stack */
13585 t = fold_build_pointer_plus_hwi (arg, size + 7);
13586 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13587 build_int_cst (TREE_TYPE (t), -8));
13588 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13589 /* String up roundup and advance. */
13590 if (roundup)
13591 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13592 /* String up with arg */
13593 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13594 /* Big-endianness related address adjustment. */
13595 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13596 && size < UNITS_PER_WORD)
13597 {
13598 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13599 size_int (UNITS_PER_WORD - size));
13600 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13601 }
13602
13603 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13604 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13605
13606 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13607 t = off;
13608 if (adjust)
13609 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13610 build_int_cst (TREE_TYPE (off), adjust));
13611
13612 t = fold_convert (sizetype, t);
13613 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13614
13615 if (is_ha)
13616 {
13617 /* type ha; // treat as "struct {ftype field[n];}"
13618 ... [computing offs]
13619 for (i = 0; i < nregs; ++i, offs += 16)
13620 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13621 return ha; */
13622 int i;
13623 tree tmp_ha, field_t, field_ptr_t;
13624
13625 /* Declare a local variable. */
13626 tmp_ha = create_tmp_var_raw (type, "ha");
13627 gimple_add_tmp_var (tmp_ha);
13628
13629 /* Establish the base type. */
13630 switch (ag_mode)
13631 {
13632 case E_SFmode:
13633 field_t = float_type_node;
13634 field_ptr_t = float_ptr_type_node;
13635 break;
13636 case E_DFmode:
13637 field_t = double_type_node;
13638 field_ptr_t = double_ptr_type_node;
13639 break;
13640 case E_TFmode:
13641 field_t = long_double_type_node;
13642 field_ptr_t = long_double_ptr_type_node;
13643 break;
13644 case E_HFmode:
13645 field_t = aarch64_fp16_type_node;
13646 field_ptr_t = aarch64_fp16_ptr_type_node;
13647 break;
13648 case E_V2SImode:
13649 case E_V4SImode:
13650 {
13651 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13652 field_t = build_vector_type_for_mode (innertype, ag_mode);
13653 field_ptr_t = build_pointer_type (field_t);
13654 }
13655 break;
13656 default:
13657 gcc_assert (0);
13658 }
13659
13660 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
13661 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13662 addr = t;
13663 t = fold_convert (field_ptr_t, addr);
13664 t = build2 (MODIFY_EXPR, field_t,
13665 build1 (INDIRECT_REF, field_t, tmp_ha),
13666 build1 (INDIRECT_REF, field_t, t));
13667
13668 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13669 for (i = 1; i < nregs; ++i)
13670 {
13671 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13672 u = fold_convert (field_ptr_t, addr);
13673 u = build2 (MODIFY_EXPR, field_t,
13674 build2 (MEM_REF, field_t, tmp_ha,
13675 build_int_cst (field_ptr_t,
13676 (i *
13677 int_size_in_bytes (field_t)))),
13678 build1 (INDIRECT_REF, field_t, u));
13679 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13680 }
13681
13682 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13683 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13684 }
13685
13686 COND_EXPR_ELSE (cond2) = t;
13687 addr = fold_convert (build_pointer_type (type), cond1);
13688 addr = build_va_arg_indirect_ref (addr);
13689
13690 if (indirect_p)
13691 addr = build_va_arg_indirect_ref (addr);
13692
13693 return addr;
13694 }
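/* Rough sketch of the trees built above for a simple "int" argument
   (illustrative only, ignoring alignment and big-endian adjustments):

     off = ap.__gr_offs;
     if (off >= 0)
       addr = on_stack;                    // register area already exhausted
     else
       {
         ap.__gr_offs = off + 8;
         if (ap.__gr_offs > 0)
           addr = on_stack;                // this argument spilled to the stack
         else
           addr = ap.__gr_top + off;       // load from the GP save area
       }

   where on_stack reads ap.__stack and advances it by the rounded size.  */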
13695
13696 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13697
13698 static void
13699 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13700 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13701 int no_rtl)
13702 {
13703 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13704 CUMULATIVE_ARGS local_cum;
13705 int gr_saved = cfun->va_list_gpr_size;
13706 int vr_saved = cfun->va_list_fpr_size;
13707
13708 /* The caller has advanced CUM up to, but not beyond, the last named
13709 argument. Advance a local copy of CUM past the last "real" named
13710 argument, to find out how many registers are left over. */
13711 local_cum = *cum;
13712 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13713
13714 /* Find out how many registers we need to save.
13715 Honor tree-stdarg analysis results. */
13716 if (cfun->va_list_gpr_size)
13717 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13718 cfun->va_list_gpr_size / UNITS_PER_WORD);
13719 if (cfun->va_list_fpr_size)
13720 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13721 cfun->va_list_fpr_size / UNITS_PER_VREG);
13722
13723 if (!TARGET_FLOAT)
13724 {
13725 gcc_assert (local_cum.aapcs_nvrn == 0);
13726 vr_saved = 0;
13727 }
13728
13729 if (!no_rtl)
13730 {
13731 if (gr_saved > 0)
13732 {
13733 rtx ptr, mem;
13734
13735 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13736 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13737 - gr_saved * UNITS_PER_WORD);
13738 mem = gen_frame_mem (BLKmode, ptr);
13739 set_mem_alias_set (mem, get_varargs_alias_set ());
13740
13741 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13742 mem, gr_saved);
13743 }
13744 if (vr_saved > 0)
13745 {
13746 /* We can't use move_block_from_reg, because it will use
13747 the wrong mode, storing D regs only. */
13748 machine_mode mode = TImode;
13749 int off, i, vr_start;
13750
13751 /* Set OFF to the offset from virtual_incoming_args_rtx of
13752 the first vector register. The VR save area lies below
13753 the GR one, and is aligned to 16 bytes. */
13754 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13755 STACK_BOUNDARY / BITS_PER_UNIT);
13756 off -= vr_saved * UNITS_PER_VREG;
13757
13758 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13759 for (i = 0; i < vr_saved; ++i)
13760 {
13761 rtx ptr, mem;
13762
13763 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13764 mem = gen_frame_mem (mode, ptr);
13765 set_mem_alias_set (mem, get_varargs_alias_set ());
13766 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13767 off += UNITS_PER_VREG;
13768 }
13769 }
13770 }
13771
13772 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13773 any complication of having crtl->args.pretend_args_size changed. */
13774 cfun->machine->frame.saved_varargs_size
13775 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13776 STACK_BOUNDARY / BITS_PER_UNIT)
13777 + vr_saved * UNITS_PER_VREG);
13778 }
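/* Illustrative frame layout (a sketch) of the save areas built above,
   relative to virtual_incoming_args_rtx and as later exposed by va_start:

        stack-passed varargs                        <- __stack
        virtual_incoming_args_rtx                   <- __gr_top
        gr_saved * 8 bytes of saved x-registers
        padding up to a 16-byte boundary            <- __vr_top
        vr_saved * 16 bytes of saved q-registers
        (lower addresses)  */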
13779
13780 static void
13781 aarch64_conditional_register_usage (void)
13782 {
13783 int i;
13784 if (!TARGET_FLOAT)
13785 {
13786 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13787 {
13788 fixed_regs[i] = 1;
13789 call_used_regs[i] = 1;
13790 }
13791 }
13792 if (!TARGET_SVE)
13793 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13794 {
13795 fixed_regs[i] = 1;
13796 call_used_regs[i] = 1;
13797 }
13798
13799 /* When tracking speculation, we need a couple of call-clobbered registers
13800 to track the speculation state. It would be nice to just use
13801 IP0 and IP1, but currently there are numerous places that just
13802 assume these registers are free for other uses (e.g. pointer
13803 authentication). */
13804 if (aarch64_track_speculation)
13805 {
13806 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13807 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13808 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13809 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13810 }
13811 }
13812
13813 /* Walk down the type tree of TYPE counting consecutive base elements.
13814 If *MODEP is VOIDmode, then set it to the first valid floating point
13815 type. If a non-floating point type is found, or if a floating point
13816 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13817 otherwise return the count in the sub-tree. */
13818 static int
13819 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13820 {
13821 machine_mode mode;
13822 HOST_WIDE_INT size;
13823
13824 switch (TREE_CODE (type))
13825 {
13826 case REAL_TYPE:
13827 mode = TYPE_MODE (type);
13828 if (mode != DFmode && mode != SFmode
13829 && mode != TFmode && mode != HFmode)
13830 return -1;
13831
13832 if (*modep == VOIDmode)
13833 *modep = mode;
13834
13835 if (*modep == mode)
13836 return 1;
13837
13838 break;
13839
13840 case COMPLEX_TYPE:
13841 mode = TYPE_MODE (TREE_TYPE (type));
13842 if (mode != DFmode && mode != SFmode
13843 && mode != TFmode && mode != HFmode)
13844 return -1;
13845
13846 if (*modep == VOIDmode)
13847 *modep = mode;
13848
13849 if (*modep == mode)
13850 return 2;
13851
13852 break;
13853
13854 case VECTOR_TYPE:
13855 /* Use V2SImode and V4SImode as representatives of all 64-bit
13856 and 128-bit vector types. */
13857 size = int_size_in_bytes (type);
13858 switch (size)
13859 {
13860 case 8:
13861 mode = V2SImode;
13862 break;
13863 case 16:
13864 mode = V4SImode;
13865 break;
13866 default:
13867 return -1;
13868 }
13869
13870 if (*modep == VOIDmode)
13871 *modep = mode;
13872
13873 /* Vector modes are considered to be opaque: two vectors are
13874 equivalent for the purposes of being homogeneous aggregates
13875 if they are the same size. */
13876 if (*modep == mode)
13877 return 1;
13878
13879 break;
13880
13881 case ARRAY_TYPE:
13882 {
13883 int count;
13884 tree index = TYPE_DOMAIN (type);
13885
13886 /* Can't handle incomplete types or sizes that are not
13887 fixed. */
13888 if (!COMPLETE_TYPE_P (type)
13889 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13890 return -1;
13891
13892 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13893 if (count == -1
13894 || !index
13895 || !TYPE_MAX_VALUE (index)
13896 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13897 || !TYPE_MIN_VALUE (index)
13898 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13899 || count < 0)
13900 return -1;
13901
13902 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13903 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13904
13905 /* There must be no padding. */
13906 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13907 count * GET_MODE_BITSIZE (*modep)))
13908 return -1;
13909
13910 return count;
13911 }
13912
13913 case RECORD_TYPE:
13914 {
13915 int count = 0;
13916 int sub_count;
13917 tree field;
13918
13919 /* Can't handle incomplete types or sizes that are not
13920 fixed. */
13921 if (!COMPLETE_TYPE_P (type)
13922 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13923 return -1;
13924
13925 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13926 {
13927 if (TREE_CODE (field) != FIELD_DECL)
13928 continue;
13929
13930 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13931 if (sub_count < 0)
13932 return -1;
13933 count += sub_count;
13934 }
13935
13936 /* There must be no padding. */
13937 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13938 count * GET_MODE_BITSIZE (*modep)))
13939 return -1;
13940
13941 return count;
13942 }
13943
13944 case UNION_TYPE:
13945 case QUAL_UNION_TYPE:
13946 {
13947 /* These aren't very interesting except in a degenerate case. */
13948 int count = 0;
13949 int sub_count;
13950 tree field;
13951
13952 /* Can't handle incomplete types or sizes that are not
13953 fixed. */
13954 if (!COMPLETE_TYPE_P (type)
13955 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13956 return -1;
13957
13958 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13959 {
13960 if (TREE_CODE (field) != FIELD_DECL)
13961 continue;
13962
13963 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13964 if (sub_count < 0)
13965 return -1;
13966 count = count > sub_count ? count : sub_count;
13967 }
13968
13969 /* There must be no padding. */
13970 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13971 count * GET_MODE_BITSIZE (*modep)))
13972 return -1;
13973
13974 return count;
13975 }
13976
13977 default:
13978 break;
13979 }
13980
13981 return -1;
13982 }
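/* Illustrative results (a sketch):

     struct { double a, b; }         ->  2, *modep = DFmode
     _Complex float                  ->  2, *modep = SFmode
     double[3]                       ->  3, *modep = DFmode
     struct { float a; double b; }   -> -1  (element modes differ)
     struct { float a; int b; }      -> -1  (non-FP, non-vector member)  */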
13983
13984 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13985 type as described in AAPCS64 \S 4.1.2.
13986
13987 See the comment above aarch64_composite_type_p for the notes on MODE. */
13988
13989 static bool
13990 aarch64_short_vector_p (const_tree type,
13991 machine_mode mode)
13992 {
13993 poly_int64 size = -1;
13994
13995 if (type && TREE_CODE (type) == VECTOR_TYPE)
13996 size = int_size_in_bytes (type);
13997 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13998 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13999 size = GET_MODE_SIZE (mode);
14000
14001 return known_eq (size, 8) || known_eq (size, 16);
14002 }
14003
14004 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14005 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14006 array types. The C99 floating-point complex types are also considered
14007 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14008 types, which are GCC extensions and out of the scope of AAPCS64, are
14009 treated as composite types here as well.
14010
14011 Note that MODE itself is not sufficient in determining whether a type
14012 is such a composite type or not. This is because
14013 stor-layout.c:compute_record_mode may have already changed the MODE
14014 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14015 structure with only one field may have its MODE set to the mode of the
14016 field. Also an integer mode whose size matches the size of the
14017 RECORD_TYPE type may be used to substitute the original mode
14018 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14019 solely relied on. */
14020
14021 static bool
14022 aarch64_composite_type_p (const_tree type,
14023 machine_mode mode)
14024 {
14025 if (aarch64_short_vector_p (type, mode))
14026 return false;
14027
14028 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14029 return true;
14030
14031 if (mode == BLKmode
14032 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14033 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14034 return true;
14035
14036 return false;
14037 }
14038
14039 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14040 shall be passed or returned in simd/fp register(s) (providing these
14041 parameter passing registers are available).
14042
14043 Upon successful return, *COUNT returns the number of needed registers,
14044 *BASE_MODE returns the mode of the individual register and when IS_HAF
14045 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14046 floating-point aggregate or a homogeneous short-vector aggregate. */
14047
14048 static bool
14049 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14050 const_tree type,
14051 machine_mode *base_mode,
14052 int *count,
14053 bool *is_ha)
14054 {
14055 machine_mode new_mode = VOIDmode;
14056 bool composite_p = aarch64_composite_type_p (type, mode);
14057
14058 if (is_ha != NULL) *is_ha = false;
14059
14060 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14061 || aarch64_short_vector_p (type, mode))
14062 {
14063 *count = 1;
14064 new_mode = mode;
14065 }
14066 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14067 {
14068 if (is_ha != NULL) *is_ha = true;
14069 *count = 2;
14070 new_mode = GET_MODE_INNER (mode);
14071 }
14072 else if (type && composite_p)
14073 {
14074 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14075
14076 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14077 {
14078 if (is_ha != NULL) *is_ha = true;
14079 *count = ag_count;
14080 }
14081 else
14082 return false;
14083 }
14084 else
14085 return false;
14086
14087 *base_mode = new_mode;
14088 return true;
14089 }
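/* Illustrative candidates (a sketch; the vector type assumes arm_neon.h):

     double                          -> *count = 1, base mode DFmode
     struct { float x, y, z; }       -> HFA, *count = 3, base SFmode (s0-s2)
     struct { float32x2_t lo, hi; }  -> HVA, *count = 2, 64-bit vector base
     struct { double d[5]; }         -> rejected, 5 > HA_MAX_NUM_FLDS  */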
14090
14091 /* Implement TARGET_STRUCT_VALUE_RTX. */
14092
14093 static rtx
14094 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14095 int incoming ATTRIBUTE_UNUSED)
14096 {
14097 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14098 }
14099
14100 /* Implements target hook vector_mode_supported_p. */
14101 static bool
14102 aarch64_vector_mode_supported_p (machine_mode mode)
14103 {
14104 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14105 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14106 }
14107
14108 /* Return appropriate SIMD container
14109 for MODE within a vector of WIDTH bits. */
14110 static machine_mode
14111 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14112 {
14113 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14114 switch (mode)
14115 {
14116 case E_DFmode:
14117 return VNx2DFmode;
14118 case E_SFmode:
14119 return VNx4SFmode;
14120 case E_HFmode:
14121 return VNx8HFmode;
14122 case E_DImode:
14123 return VNx2DImode;
14124 case E_SImode:
14125 return VNx4SImode;
14126 case E_HImode:
14127 return VNx8HImode;
14128 case E_QImode:
14129 return VNx16QImode;
14130 default:
14131 return word_mode;
14132 }
14133
14134 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14135 if (TARGET_SIMD)
14136 {
14137 if (known_eq (width, 128))
14138 switch (mode)
14139 {
14140 case E_DFmode:
14141 return V2DFmode;
14142 case E_SFmode:
14143 return V4SFmode;
14144 case E_HFmode:
14145 return V8HFmode;
14146 case E_SImode:
14147 return V4SImode;
14148 case E_HImode:
14149 return V8HImode;
14150 case E_QImode:
14151 return V16QImode;
14152 case E_DImode:
14153 return V2DImode;
14154 default:
14155 break;
14156 }
14157 else
14158 switch (mode)
14159 {
14160 case E_SFmode:
14161 return V2SFmode;
14162 case E_HFmode:
14163 return V4HFmode;
14164 case E_SImode:
14165 return V2SImode;
14166 case E_HImode:
14167 return V4HImode;
14168 case E_QImode:
14169 return V8QImode;
14170 default:
14171 break;
14172 }
14173 }
14174 return word_mode;
14175 }
14176
14177 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14178 static machine_mode
14179 aarch64_preferred_simd_mode (scalar_mode mode)
14180 {
14181 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14182 return aarch64_simd_container_mode (mode, bits);
14183 }
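/* Illustrative mappings (a sketch):

     aarch64_simd_container_mode (SFmode, 128)  -> V4SFmode
     aarch64_simd_container_mode (HImode, 64)   -> V4HImode
     aarch64_preferred_simd_mode (SFmode)       -> V4SFmode, or VNx4SFmode
                                                   when SVE is enabled  */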
14184
14185 /* Return a list of possible vector sizes for the vectorizer
14186 to iterate over. */
14187 static void
14188 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14189 {
14190 if (TARGET_SVE)
14191 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14192 sizes->safe_push (16);
14193 sizes->safe_push (8);
14194 }
14195
14196 /* Implement TARGET_MANGLE_TYPE. */
14197
14198 static const char *
14199 aarch64_mangle_type (const_tree type)
14200 {
14201 /* The AArch64 ABI documents say that "__va_list" has to be
14202 mangled as if it is in the "std" namespace. */
14203 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14204 return "St9__va_list";
14205
14206 /* Half-precision float. */
14207 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14208 return "Dh";
14209
14210 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14211 builtin types. */
14212 if (TYPE_NAME (type) != NULL)
14213 return aarch64_mangle_builtin_type (type);
14214
14215 /* Use the default mangling. */
14216 return NULL;
14217 }
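/* Illustrative manglings (a sketch):

     void f (va_list);   ->  _Z1fSt9__va_list   (as if std::__va_list)
     void g (__fp16);    ->  _Z1gDh             */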
14218
14219 /* Find the first rtx_insn before INSN that will generate an assembly
14220 instruction. */
14221
14222 static rtx_insn *
14223 aarch64_prev_real_insn (rtx_insn *insn)
14224 {
14225 if (!insn)
14226 return NULL;
14227
14228 do
14229 {
14230 insn = prev_real_insn (insn);
14231 }
14232 while (insn && recog_memoized (insn) < 0);
14233
14234 return insn;
14235 }
14236
14237 static bool
14238 is_madd_op (enum attr_type t1)
14239 {
14240 unsigned int i;
14241 /* A number of these may be AArch32 only. */
14242 enum attr_type mlatypes[] = {
14243 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14244 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14245 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14246 };
14247
14248 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14249 {
14250 if (t1 == mlatypes[i])
14251 return true;
14252 }
14253
14254 return false;
14255 }
14256
14257 /* Check if there is a register dependency between a load and the insn
14258 for which we hold recog_data. */
14259
14260 static bool
14261 dep_between_memop_and_curr (rtx memop)
14262 {
14263 rtx load_reg;
14264 int opno;
14265
14266 gcc_assert (GET_CODE (memop) == SET);
14267
14268 if (!REG_P (SET_DEST (memop)))
14269 return false;
14270
14271 load_reg = SET_DEST (memop);
14272 for (opno = 1; opno < recog_data.n_operands; opno++)
14273 {
14274 rtx operand = recog_data.operand[opno];
14275 if (REG_P (operand)
14276 && reg_overlap_mentioned_p (load_reg, operand))
14277 return true;
14278
14279 }
14280 return false;
14281 }
14282
14283
14284 /* When working around the Cortex-A53 erratum 835769,
14285 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14286 instruction and has a preceding memory instruction such that a NOP
14287 should be inserted between them. */
14288
14289 bool
14290 aarch64_madd_needs_nop (rtx_insn* insn)
14291 {
14292 enum attr_type attr_type;
14293 rtx_insn *prev;
14294 rtx body;
14295
14296 if (!TARGET_FIX_ERR_A53_835769)
14297 return false;
14298
14299 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14300 return false;
14301
14302 attr_type = get_attr_type (insn);
14303 if (!is_madd_op (attr_type))
14304 return false;
14305
14306 prev = aarch64_prev_real_insn (insn);
14307 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14308 Restore recog state to INSN to avoid state corruption. */
14309 extract_constrain_insn_cached (insn);
14310
14311 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14312 return false;
14313
14314 body = single_set (prev);
14315
14316 /* If the previous insn is a memory op and there is no dependency between
14317 it and the DImode madd, emit a NOP between them. If body is NULL then we
14318 have a complex memory operation, probably a load/store pair.
14319 Be conservative for now and emit a NOP. */
14320 if (GET_MODE (recog_data.operand[0]) == DImode
14321 && (!body || !dep_between_memop_and_curr (body)))
14322 return true;
14323
14324 return false;
14325
14326 }
14327
14328
14329 /* Implement FINAL_PRESCAN_INSN. */
14330
14331 void
14332 aarch64_final_prescan_insn (rtx_insn *insn)
14333 {
14334 if (aarch64_madd_needs_nop (insn))
14335 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14336 }
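/* Illustrative example: with -mfix-cortex-a53-835769 enabled, a sequence
   such as

	ldr	x1, [x2]
	madd	x0, x3, x4, x5

   has no register dependency between the load and the 64-bit
   multiply-accumulate, so the hook above emits a padding NOP between the
   two instructions, which is the separation the erratum workaround
   requires.  */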
14337
14338
14339 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14340 instruction. */
14341
14342 bool
14343 aarch64_sve_index_immediate_p (rtx base_or_step)
14344 {
14345 return (CONST_INT_P (base_or_step)
14346 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14347 }
14348
14349 /* Return true if X is a valid immediate for the SVE ADD and SUB
14350 instructions. Negate X first if NEGATE_P is true. */
14351
14352 bool
14353 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14354 {
14355 rtx elt;
14356
14357 if (!const_vec_duplicate_p (x, &elt)
14358 || !CONST_INT_P (elt))
14359 return false;
14360
14361 HOST_WIDE_INT val = INTVAL (elt);
14362 if (negate_p)
14363 val = -val;
14364 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14365
14366 if (val & 0xff)
14367 return IN_RANGE (val, 0, 0xff);
14368 return IN_RANGE (val, 0, 0xff00);
14369 }
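/* Worked example (illustrative): the SVE ADD/SUB immediate is an 8-bit
   value, optionally shifted left by 8.  A duplicated element value of 7
   is accepted directly, 0x1100 is accepted as 0x11 << 8, and 0x101 is
   rejected because it needs nonzero bits in both bytes.  */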
14370
14371 /* Return true if X is a valid immediate operand for an SVE logical
14372 instruction such as AND. */
14373
14374 bool
14375 aarch64_sve_bitmask_immediate_p (rtx x)
14376 {
14377 rtx elt;
14378
14379 return (const_vec_duplicate_p (x, &elt)
14380 && CONST_INT_P (elt)
14381 && aarch64_bitmask_imm (INTVAL (elt),
14382 GET_MODE_INNER (GET_MODE (x))));
14383 }
14384
14385 /* Return true if X is a valid immediate for the SVE DUP and CPY
14386 instructions. */
14387
14388 bool
14389 aarch64_sve_dup_immediate_p (rtx x)
14390 {
14391 rtx elt;
14392
14393 if (!const_vec_duplicate_p (x, &elt)
14394 || !CONST_INT_P (elt))
14395 return false;
14396
14397 HOST_WIDE_INT val = INTVAL (elt);
14398 if (val & 0xff)
14399 return IN_RANGE (val, -0x80, 0x7f);
14400 return IN_RANGE (val, -0x8000, 0x7f00);
14401 }
14402
14403 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14404 SIGNED_P says whether the operand is signed rather than unsigned. */
14405
14406 bool
14407 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14408 {
14409 rtx elt;
14410
14411 return (const_vec_duplicate_p (x, &elt)
14412 && CONST_INT_P (elt)
14413 && (signed_p
14414 ? IN_RANGE (INTVAL (elt), -16, 15)
14415 : IN_RANGE (INTVAL (elt), 0, 127)));
14416 }
14417
14418 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14419 instruction. Negate X first if NEGATE_P is true. */
14420
14421 bool
14422 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14423 {
14424 rtx elt;
14425 REAL_VALUE_TYPE r;
14426
14427 if (!const_vec_duplicate_p (x, &elt)
14428 || GET_CODE (elt) != CONST_DOUBLE)
14429 return false;
14430
14431 r = *CONST_DOUBLE_REAL_VALUE (elt);
14432
14433 if (negate_p)
14434 r = real_value_negate (&r);
14435
14436 if (real_equal (&r, &dconst1))
14437 return true;
14438 if (real_equal (&r, &dconsthalf))
14439 return true;
14440 return false;
14441 }
14442
14443 /* Return true if X is a valid immediate operand for an SVE FMUL
14444 instruction. */
14445
14446 bool
14447 aarch64_sve_float_mul_immediate_p (rtx x)
14448 {
14449 rtx elt;
14450
14451 /* GCC will never generate a multiply with an immediate of 2, so there is no
14452 point testing for it (even though it is a valid constant). */
14453 return (const_vec_duplicate_p (x, &elt)
14454 && GET_CODE (elt) == CONST_DOUBLE
14455 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14456 }
14457
14458 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14459 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14460 is nonnull, use it to describe valid immediates. */
14461 static bool
14462 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14463 simd_immediate_info *info,
14464 enum simd_immediate_check which,
14465 simd_immediate_info::insn_type insn)
14466 {
14467 /* Try a 4-byte immediate with LSL. */
14468 for (unsigned int shift = 0; shift < 32; shift += 8)
14469 if ((val32 & (0xff << shift)) == val32)
14470 {
14471 if (info)
14472 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14473 simd_immediate_info::LSL, shift);
14474 return true;
14475 }
14476
14477 /* Try a 2-byte immediate with LSL. */
14478 unsigned int imm16 = val32 & 0xffff;
14479 if (imm16 == (val32 >> 16))
14480 for (unsigned int shift = 0; shift < 16; shift += 8)
14481 if ((imm16 & (0xff << shift)) == imm16)
14482 {
14483 if (info)
14484 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14485 simd_immediate_info::LSL, shift);
14486 return true;
14487 }
14488
14489 /* Try a 4-byte immediate with MSL, except for cases that MVN
14490 can handle. */
14491 if (which == AARCH64_CHECK_MOV)
14492 for (unsigned int shift = 8; shift < 24; shift += 8)
14493 {
14494 unsigned int low = (1 << shift) - 1;
14495 if (((val32 & (0xff << shift)) | low) == val32)
14496 {
14497 if (info)
14498 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14499 simd_immediate_info::MSL, shift);
14500 return true;
14501 }
14502 }
14503
14504 return false;
14505 }
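/* Worked example (illustrative): VAL32 == 0x00ab0000 matches the 4-byte
   LSL loop with shift == 16, so INFO records the value 0xab shifted left
   by 16.  VAL32 == 0x0000abff matches none of the LSL forms, but for
   AARCH64_CHECK_MOV it matches the MSL loop with shift == 8, since
   (0x0000ab00 | 0xff) == 0x0000abff.  */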
14506
14507 /* Return true if replicating VAL64 is a valid immediate for the
14508 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14509 use it to describe valid immediates. */
14510 static bool
14511 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14512 simd_immediate_info *info,
14513 enum simd_immediate_check which)
14514 {
14515 unsigned int val32 = val64 & 0xffffffff;
14516 unsigned int val16 = val64 & 0xffff;
14517 unsigned int val8 = val64 & 0xff;
14518
14519 if (val32 == (val64 >> 32))
14520 {
14521 if ((which & AARCH64_CHECK_ORR) != 0
14522 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14523 simd_immediate_info::MOV))
14524 return true;
14525
14526 if ((which & AARCH64_CHECK_BIC) != 0
14527 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14528 simd_immediate_info::MVN))
14529 return true;
14530
14531 /* Try using a replicated byte. */
14532 if (which == AARCH64_CHECK_MOV
14533 && val16 == (val32 >> 16)
14534 && val8 == (val16 >> 8))
14535 {
14536 if (info)
14537 *info = simd_immediate_info (QImode, val8);
14538 return true;
14539 }
14540 }
14541
14542 /* Try using a bit-to-bytemask. */
14543 if (which == AARCH64_CHECK_MOV)
14544 {
14545 unsigned int i;
14546 for (i = 0; i < 64; i += 8)
14547 {
14548 unsigned char byte = (val64 >> i) & 0xff;
14549 if (byte != 0 && byte != 0xff)
14550 break;
14551 }
14552 if (i == 64)
14553 {
14554 if (info)
14555 *info = simd_immediate_info (DImode, val64);
14556 return true;
14557 }
14558 }
14559 return false;
14560 }
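/* Worked example (illustrative): VAL64 == 0xff000000ff0000ff differs
   between its two 32-bit halves, so the replicated 2-byte and 4-byte
   forms above do not apply, but every byte is either 0x00 or 0xff, so
   for AARCH64_CHECK_MOV the bit-to-bytemask test accepts it as a
   64-bit immediate.  */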
14561
14562 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14563 instruction. If INFO is nonnull, use it to describe valid immediates. */
14564
14565 static bool
14566 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14567 simd_immediate_info *info)
14568 {
14569 scalar_int_mode mode = DImode;
14570 unsigned int val32 = val64 & 0xffffffff;
14571 if (val32 == (val64 >> 32))
14572 {
14573 mode = SImode;
14574 unsigned int val16 = val32 & 0xffff;
14575 if (val16 == (val32 >> 16))
14576 {
14577 mode = HImode;
14578 unsigned int val8 = val16 & 0xff;
14579 if (val8 == (val16 >> 8))
14580 mode = QImode;
14581 }
14582 }
14583 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14584 if (IN_RANGE (val, -0x80, 0x7f))
14585 {
14586 /* DUP with no shift. */
14587 if (info)
14588 *info = simd_immediate_info (mode, val);
14589 return true;
14590 }
14591 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14592 {
14593 /* DUP with LSL #8. */
14594 if (info)
14595 *info = simd_immediate_info (mode, val);
14596 return true;
14597 }
14598 if (aarch64_bitmask_imm (val64, mode))
14599 {
14600 /* DUPM. */
14601 if (info)
14602 *info = simd_immediate_info (mode, val);
14603 return true;
14604 }
14605 return false;
14606 }
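/* Worked example (illustrative): VAL64 == 0x2020202020202020 narrows all
   the way down to the QImode value 0x20, which is within the signed
   8-bit range, so it is accepted as a byte DUP of #32.  VAL64 ==
   0x1234123412341234 narrows to the HImode value 0x1234, which fits
   neither DUP form and is not a bitmask immediate, so it is rejected.  */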
14607
14608 /* Return true if OP is a valid SIMD immediate for the operation
14609 described by WHICH. If INFO is nonnull, use it to describe valid
14610 immediates. */
14611 bool
14612 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14613 enum simd_immediate_check which)
14614 {
14615 machine_mode mode = GET_MODE (op);
14616 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14617 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14618 return false;
14619
14620 scalar_mode elt_mode = GET_MODE_INNER (mode);
14621 rtx base, step;
14622 unsigned int n_elts;
14623 if (GET_CODE (op) == CONST_VECTOR
14624 && CONST_VECTOR_DUPLICATE_P (op))
14625 n_elts = CONST_VECTOR_NPATTERNS (op);
14626 else if ((vec_flags & VEC_SVE_DATA)
14627 && const_vec_series_p (op, &base, &step))
14628 {
14629 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14630 if (!aarch64_sve_index_immediate_p (base)
14631 || !aarch64_sve_index_immediate_p (step))
14632 return false;
14633
14634 if (info)
14635 *info = simd_immediate_info (elt_mode, base, step);
14636 return true;
14637 }
14638 else if (GET_CODE (op) == CONST_VECTOR
14639 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14640 /* N_ELTS set above. */;
14641 else
14642 return false;
14643
14644 /* Handle PFALSE and PTRUE. */
14645 if (vec_flags & VEC_SVE_PRED)
14646 return (op == CONST0_RTX (mode)
14647 || op == CONSTM1_RTX (mode));
14648
14649 scalar_float_mode elt_float_mode;
14650 if (n_elts == 1
14651 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14652 {
14653 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14654 if (aarch64_float_const_zero_rtx_p (elt)
14655 || aarch64_float_const_representable_p (elt))
14656 {
14657 if (info)
14658 *info = simd_immediate_info (elt_float_mode, elt);
14659 return true;
14660 }
14661 }
14662
14663 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14664 if (elt_size > 8)
14665 return false;
14666
14667 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14668
14669 /* Expand the vector constant out into a byte vector, with the least
14670 significant byte of the register first. */
14671 auto_vec<unsigned char, 16> bytes;
14672 bytes.reserve (n_elts * elt_size);
14673 for (unsigned int i = 0; i < n_elts; i++)
14674 {
14675 /* The vector is provided in gcc endian-neutral fashion.
14676 For aarch64_be Advanced SIMD, it must be laid out in the vector
14677 register in reverse order. */
14678 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14679 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14680
14681 if (elt_mode != elt_int_mode)
14682 elt = gen_lowpart (elt_int_mode, elt);
14683
14684 if (!CONST_INT_P (elt))
14685 return false;
14686
14687 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14688 for (unsigned int byte = 0; byte < elt_size; byte++)
14689 {
14690 bytes.quick_push (elt_val & 0xff);
14691 elt_val >>= BITS_PER_UNIT;
14692 }
14693 }
14694
14695 /* The immediate must repeat every eight bytes. */
14696 unsigned int nbytes = bytes.length ();
14697 for (unsigned i = 8; i < nbytes; ++i)
14698 if (bytes[i] != bytes[i - 8])
14699 return false;
14700
14701 /* Get the repeating 8-byte value as an integer. No endian correction
14702 is needed here because bytes is already in lsb-first order. */
14703 unsigned HOST_WIDE_INT val64 = 0;
14704 for (unsigned int i = 0; i < 8; i++)
14705 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14706 << (i * BITS_PER_UNIT));
14707
14708 if (vec_flags & VEC_SVE_DATA)
14709 return aarch64_sve_valid_immediate (val64, info);
14710 else
14711 return aarch64_advsimd_valid_immediate (val64, info, which);
14712 }
14713
14714 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14715 has a step in the range of an SVE INDEX immediate. Return the step
14716 if so, otherwise return null. */
14717 rtx
14718 aarch64_check_zero_based_sve_index_immediate (rtx x)
14719 {
14720 rtx base, step;
14721 if (const_vec_series_p (x, &base, &step)
14722 && base == const0_rtx
14723 && aarch64_sve_index_immediate_p (step))
14724 return step;
14725 return NULL_RTX;
14726 }
14727
14728 /* Check if immediate shift constants are within range. */
14729 bool
14730 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14731 {
14732 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14733 if (left)
14734 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14735 else
14736 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14737 }
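/* Worked example (illustrative): for 32-bit elements the immediate must
   be in [0, 31] for a left shift but in [1, 32] for a right shift,
   matching the encodings of the SHL and SSHR/USHR families.  */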
14738
14739 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14740 operation of width WIDTH at bit position POS. */
14741
14742 rtx
14743 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14744 {
14745 gcc_assert (CONST_INT_P (width));
14746 gcc_assert (CONST_INT_P (pos));
14747
14748 unsigned HOST_WIDE_INT mask
14749 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14750 return GEN_INT (mask << UINTVAL (pos));
14751 }
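/* Worked example (illustrative):
     aarch64_mask_from_zextract_ops (GEN_INT (8), GEN_INT (16))
   returns GEN_INT (0xff0000), i.e. the mask selecting bits 16..23 that
   the corresponding zero_extract reads.  */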
14752
14753 bool
14754 aarch64_mov_operand_p (rtx x, machine_mode mode)
14755 {
14756 if (GET_CODE (x) == HIGH
14757 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14758 return true;
14759
14760 if (CONST_INT_P (x))
14761 return true;
14762
14763 if (VECTOR_MODE_P (GET_MODE (x)))
14764 return aarch64_simd_valid_immediate (x, NULL);
14765
14766 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14767 return true;
14768
14769 if (aarch64_sve_cnt_immediate_p (x))
14770 return true;
14771
14772 return aarch64_classify_symbolic_expression (x)
14773 == SYMBOL_TINY_ABSOLUTE;
14774 }
14775
14776 /* Return a const_int vector of VAL. */
14777 rtx
14778 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14779 {
14780 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14781 return gen_const_vec_duplicate (mode, c);
14782 }
14783
14784 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14785
14786 bool
14787 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14788 {
14789 machine_mode vmode;
14790
14791 vmode = aarch64_simd_container_mode (mode, 64);
14792 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14793 return aarch64_simd_valid_immediate (op_v, NULL);
14794 }
14795
14796 /* Construct and return a PARALLEL RTX vector with elements numbering the
14797 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14798 the vector - from the perspective of the architecture. This does not
14799 line up with GCC's perspective on lane numbers, so we end up with
14800 different masks depending on our target endian-ness. The diagram
14801 below may help. We must draw the distinction when building masks
14802 which select one half of the vector. An instruction selecting
14803 architectural low-lanes for a big-endian target, must be described using
14804 a mask selecting GCC high-lanes.
14805
14806 Big-Endian Little-Endian
14807
14808 GCC 0 1 2 3 3 2 1 0
14809 | x | x | x | x | | x | x | x | x |
14810 Architecture 3 2 1 0 3 2 1 0
14811
14812 Low Mask: { 2, 3 } { 0, 1 }
14813 High Mask: { 0, 1 } { 2, 3 }
14814
14815 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14816
14817 rtx
14818 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14819 {
14820 rtvec v = rtvec_alloc (nunits / 2);
14821 int high_base = nunits / 2;
14822 int low_base = 0;
14823 int base;
14824 rtx t1;
14825 int i;
14826
14827 if (BYTES_BIG_ENDIAN)
14828 base = high ? low_base : high_base;
14829 else
14830 base = high ? high_base : low_base;
14831
14832 for (i = 0; i < nunits / 2; i++)
14833 RTVEC_ELT (v, i) = GEN_INT (base + i);
14834
14835 t1 = gen_rtx_PARALLEL (mode, v);
14836 return t1;
14837 }
14838
14839 /* Check OP for validity as a PARALLEL RTX vector with elements
14840 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14841 from the perspective of the architecture. See the diagram above
14842 aarch64_simd_vect_par_cnst_half for more details. */
14843
14844 bool
14845 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14846 bool high)
14847 {
14848 int nelts;
14849 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14850 return false;
14851
14852 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14853 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14854 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14855 int i = 0;
14856
14857 if (count_op != count_ideal)
14858 return false;
14859
14860 for (i = 0; i < count_ideal; i++)
14861 {
14862 rtx elt_op = XVECEXP (op, 0, i);
14863 rtx elt_ideal = XVECEXP (ideal, 0, i);
14864
14865 if (!CONST_INT_P (elt_op)
14866 || INTVAL (elt_ideal) != INTVAL (elt_op))
14867 return false;
14868 }
14869 return true;
14870 }
14871
14872 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14873 HIGH (exclusive). */
14874 void
14875 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14876 const_tree exp)
14877 {
14878 HOST_WIDE_INT lane;
14879 gcc_assert (CONST_INT_P (operand));
14880 lane = INTVAL (operand);
14881
14882 if (lane < low || lane >= high)
14883 {
14884 if (exp)
14885 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14886 else
14887 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14888 }
14889 }
14890
14891 /* Perform endian correction on lane number N, which indexes a vector
14892 of mode MODE, and return the result as an SImode rtx. */
14893
14894 rtx
14895 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14896 {
14897 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14898 }
14899
14900 /* Return TRUE if OP is a valid vector addressing mode. */
14901
14902 bool
14903 aarch64_simd_mem_operand_p (rtx op)
14904 {
14905 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14906 || REG_P (XEXP (op, 0)));
14907 }
14908
14909 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14910
14911 bool
14912 aarch64_sve_ld1r_operand_p (rtx op)
14913 {
14914 struct aarch64_address_info addr;
14915 scalar_mode mode;
14916
14917 return (MEM_P (op)
14918 && is_a <scalar_mode> (GET_MODE (op), &mode)
14919 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14920 && addr.type == ADDRESS_REG_IMM
14921 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14922 }
14923
14924 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14925 The conditions for STR are the same. */
14926 bool
14927 aarch64_sve_ldr_operand_p (rtx op)
14928 {
14929 struct aarch64_address_info addr;
14930
14931 return (MEM_P (op)
14932 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14933 false, ADDR_QUERY_ANY)
14934 && addr.type == ADDRESS_REG_IMM);
14935 }
14936
14937 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14938 We need to be able to access the individual pieces, so the range
14939 is different from LD[234] and ST[234]. */
14940 bool
14941 aarch64_sve_struct_memory_operand_p (rtx op)
14942 {
14943 if (!MEM_P (op))
14944 return false;
14945
14946 machine_mode mode = GET_MODE (op);
14947 struct aarch64_address_info addr;
14948 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14949 ADDR_QUERY_ANY)
14950 || addr.type != ADDRESS_REG_IMM)
14951 return false;
14952
14953 poly_int64 first = addr.const_offset;
14954 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14955 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14956 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14957 }
14958
14959 /* Emit a register copy from operand to operand, taking care not to
14960 early-clobber source registers in the process.
14961
14962 COUNT is the number of components into which the copy needs to be
14963 decomposed. */
14964 void
14965 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14966 unsigned int count)
14967 {
14968 unsigned int i;
14969 int rdest = REGNO (operands[0]);
14970 int rsrc = REGNO (operands[1]);
14971
14972 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14973 || rdest < rsrc)
14974 for (i = 0; i < count; i++)
14975 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14976 gen_rtx_REG (mode, rsrc + i));
14977 else
14978 for (i = 0; i < count; i++)
14979 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14980 gen_rtx_REG (mode, rsrc + count - i - 1));
14981 }
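/* Worked example (illustrative): an OImode copy whose destination pair
   starts one vector register above its source (say V2/V3 written from
   V1/V2) overlaps with rdest > rsrc, so the second loop above copies the
   highest component first (V3 from V2, then V2 from V1) and never reads
   a register that has already been overwritten.  */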
14982
14983 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14984 one of VSTRUCT modes: OI, CI, or XI. */
14985 int
14986 aarch64_simd_attr_length_rglist (machine_mode mode)
14987 {
14988 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14989 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14990 }
14991
14992 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14993 alignment of a vector to 128 bits. SVE predicates have an alignment of
14994 16 bits. */
14995 static HOST_WIDE_INT
14996 aarch64_simd_vector_alignment (const_tree type)
14997 {
14998 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14999 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15000 be set for non-predicate vectors of booleans. Modes are the most
15001 direct way we have of identifying real SVE predicate types. */
15002 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15003 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15004 }
15005
15006 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15007 static poly_uint64
15008 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15009 {
15010 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15011 {
15012 /* If the length of the vector is fixed, try to align to that length,
15013 otherwise don't try to align at all. */
15014 HOST_WIDE_INT result;
15015 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15016 result = TYPE_ALIGN (TREE_TYPE (type));
15017 return result;
15018 }
15019 return TYPE_ALIGN (type);
15020 }
15021
15022 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15023 static bool
15024 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15025 {
15026 if (is_packed)
15027 return false;
15028
15029 /* For fixed-length vectors, check that the vectorizer will aim for
15030 full-vector alignment. This isn't true for generic GCC vectors
15031 that are wider than the ABI maximum of 128 bits. */
15032 poly_uint64 preferred_alignment =
15033 aarch64_vectorize_preferred_vector_alignment (type);
15034 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15035 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15036 preferred_alignment))
15037 return false;
15038
15039 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15040 return true;
15041 }
15042
15043 /* Return true if the vector misalignment factor is supported by the
15044 target. */
15045 static bool
15046 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15047 const_tree type, int misalignment,
15048 bool is_packed)
15049 {
15050 if (TARGET_SIMD && STRICT_ALIGNMENT)
15051 {
15052 /* Return false if the movmisalign pattern is not supported for this mode. */
15053 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15054 return false;
15055
15056 /* Misalignment factor is unknown at compile time. */
15057 if (misalignment == -1)
15058 return false;
15059 }
15060 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15061 is_packed);
15062 }
15063
15064 /* If VALS is a vector constant that can be loaded into a register
15065 using DUP, generate instructions to do so and return an RTX to
15066 assign to the register. Otherwise return NULL_RTX. */
15067 static rtx
15068 aarch64_simd_dup_constant (rtx vals)
15069 {
15070 machine_mode mode = GET_MODE (vals);
15071 machine_mode inner_mode = GET_MODE_INNER (mode);
15072 rtx x;
15073
15074 if (!const_vec_duplicate_p (vals, &x))
15075 return NULL_RTX;
15076
15077 /* We can load this constant by using DUP and a constant in a
15078 single ARM register. This will be cheaper than a vector
15079 load. */
15080 x = copy_to_mode_reg (inner_mode, x);
15081 return gen_vec_duplicate (mode, x);
15082 }
15083
15084
15085 /* Generate code to load VALS, which is a PARALLEL containing only
15086 constants (for vec_init) or CONST_VECTOR, efficiently into a
15087 register. Returns an RTX to copy into the register, or NULL_RTX
15088 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15089 static rtx
15090 aarch64_simd_make_constant (rtx vals)
15091 {
15092 machine_mode mode = GET_MODE (vals);
15093 rtx const_dup;
15094 rtx const_vec = NULL_RTX;
15095 int n_const = 0;
15096 int i;
15097
15098 if (GET_CODE (vals) == CONST_VECTOR)
15099 const_vec = vals;
15100 else if (GET_CODE (vals) == PARALLEL)
15101 {
15102 /* A CONST_VECTOR must contain only CONST_INTs and
15103 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15104 Only store valid constants in a CONST_VECTOR. */
15105 int n_elts = XVECLEN (vals, 0);
15106 for (i = 0; i < n_elts; ++i)
15107 {
15108 rtx x = XVECEXP (vals, 0, i);
15109 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15110 n_const++;
15111 }
15112 if (n_const == n_elts)
15113 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15114 }
15115 else
15116 gcc_unreachable ();
15117
15118 if (const_vec != NULL_RTX
15119 && aarch64_simd_valid_immediate (const_vec, NULL))
15120 /* Load using MOVI/MVNI. */
15121 return const_vec;
15122 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15123 /* Loaded using DUP. */
15124 return const_dup;
15125 else if (const_vec != NULL_RTX)
15126 /* Load from constant pool. We cannot take advantage of single-cycle
15127 LD1 because we need a PC-relative addressing mode. */
15128 return const_vec;
15129 else
15130 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15131 We cannot construct an initializer. */
15132 return NULL_RTX;
15133 }
15134
15135 /* Expand a vector initialisation sequence, such that TARGET is
15136 initialised to contain VALS. */
15137
15138 void
15139 aarch64_expand_vector_init (rtx target, rtx vals)
15140 {
15141 machine_mode mode = GET_MODE (target);
15142 scalar_mode inner_mode = GET_MODE_INNER (mode);
15143 /* The number of vector elements. */
15144 int n_elts = XVECLEN (vals, 0);
15145 /* The number of vector elements which are not constant. */
15146 int n_var = 0;
15147 rtx any_const = NULL_RTX;
15148 /* The first element of vals. */
15149 rtx v0 = XVECEXP (vals, 0, 0);
15150 bool all_same = true;
15151
15152 /* This is a special vec_init<M><N> where N is not an element mode but a
15153 vector mode with half the elements of M. We expect to find two entries
15154 of mode N in VALS and we must put their concatenation into TARGET. */
15155 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15156 {
15157 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15158 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15159 rtx lo = XVECEXP (vals, 0, 0);
15160 rtx hi = XVECEXP (vals, 0, 1);
15161 machine_mode narrow_mode = GET_MODE (lo);
15162 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15163 gcc_assert (narrow_mode == GET_MODE (hi));
15164
15165 /* When we want to concatenate a half-width vector with zeroes we can
15166 use the aarch64_combinez[_be] patterns. Just make sure that the
15167 zeroes are in the right half. */
15168 if (BYTES_BIG_ENDIAN
15169 && aarch64_simd_imm_zero (lo, narrow_mode)
15170 && general_operand (hi, narrow_mode))
15171 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15172 else if (!BYTES_BIG_ENDIAN
15173 && aarch64_simd_imm_zero (hi, narrow_mode)
15174 && general_operand (lo, narrow_mode))
15175 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15176 else
15177 {
15178 /* Else create the two half-width registers and combine them. */
15179 if (!REG_P (lo))
15180 lo = force_reg (GET_MODE (lo), lo);
15181 if (!REG_P (hi))
15182 hi = force_reg (GET_MODE (hi), hi);
15183
15184 if (BYTES_BIG_ENDIAN)
15185 std::swap (lo, hi);
15186 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15187 }
15188 return;
15189 }
15190
15191 /* Count the number of variable elements to initialise. */
15192 for (int i = 0; i < n_elts; ++i)
15193 {
15194 rtx x = XVECEXP (vals, 0, i);
15195 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15196 ++n_var;
15197 else
15198 any_const = x;
15199
15200 all_same &= rtx_equal_p (x, v0);
15201 }
15202
15203 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15204 how best to handle this. */
15205 if (n_var == 0)
15206 {
15207 rtx constant = aarch64_simd_make_constant (vals);
15208 if (constant != NULL_RTX)
15209 {
15210 emit_move_insn (target, constant);
15211 return;
15212 }
15213 }
15214
15215 /* Splat a single non-constant element if we can. */
15216 if (all_same)
15217 {
15218 rtx x = copy_to_mode_reg (inner_mode, v0);
15219 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15220 return;
15221 }
15222
15223 enum insn_code icode = optab_handler (vec_set_optab, mode);
15224 gcc_assert (icode != CODE_FOR_nothing);
15225
15226 /* If there are only variable elements, try to optimize
15227 the insertion using dup for the most common element
15228 followed by insertions. */
15229
15230 /* The algorithm will fill matches[*][0] with the earliest matching element,
15231 and matches[X][1] with the count of duplicate elements (if X is the
15232 earliest element which has duplicates). */
15233
15234 if (n_var == n_elts && n_elts <= 16)
15235 {
15236 int matches[16][2] = {0};
15237 for (int i = 0; i < n_elts; i++)
15238 {
15239 for (int j = 0; j <= i; j++)
15240 {
15241 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15242 {
15243 matches[i][0] = j;
15244 matches[j][1]++;
15245 break;
15246 }
15247 }
15248 }
15249 int maxelement = 0;
15250 int maxv = 0;
15251 for (int i = 0; i < n_elts; i++)
15252 if (matches[i][1] > maxv)
15253 {
15254 maxelement = i;
15255 maxv = matches[i][1];
15256 }
15257
15258 /* Create a duplicate of the most common element, unless all elements
15259 are equally useless to us, in which case just immediately set the
15260 vector register using the first element. */
15261
15262 if (maxv == 1)
15263 {
15264 /* For vectors of two 64-bit elements, we can do even better. */
15265 if (n_elts == 2
15266 && (inner_mode == E_DImode
15267 || inner_mode == E_DFmode))
15268
15269 {
15270 rtx x0 = XVECEXP (vals, 0, 0);
15271 rtx x1 = XVECEXP (vals, 0, 1);
15272 /* Combine can pick up this case, but handling it directly
15273 here leaves clearer RTL.
15274
15275 This is load_pair_lanes<mode>, and also gives us a clean-up
15276 for store_pair_lanes<mode>. */
15277 if (memory_operand (x0, inner_mode)
15278 && memory_operand (x1, inner_mode)
15279 && !STRICT_ALIGNMENT
15280 && rtx_equal_p (XEXP (x1, 0),
15281 plus_constant (Pmode,
15282 XEXP (x0, 0),
15283 GET_MODE_SIZE (inner_mode))))
15284 {
15285 rtx t;
15286 if (inner_mode == DFmode)
15287 t = gen_load_pair_lanesdf (target, x0, x1);
15288 else
15289 t = gen_load_pair_lanesdi (target, x0, x1);
15290 emit_insn (t);
15291 return;
15292 }
15293 }
15294 /* The subreg-move sequence below will move into lane zero of the
15295 vector register. For big-endian we want that position to hold
15296 the last element of VALS. */
15297 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15298 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15299 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15300 }
15301 else
15302 {
15303 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15304 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15305 }
15306
15307 /* Insert the rest. */
15308 for (int i = 0; i < n_elts; i++)
15309 {
15310 rtx x = XVECEXP (vals, 0, i);
15311 if (matches[i][0] == maxelement)
15312 continue;
15313 x = copy_to_mode_reg (inner_mode, x);
15314 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15315 }
15316 return;
15317 }
15318
15319 /* Initialise a vector which is part-variable. We want to first try
15320 to build those lanes which are constant in the most efficient way we
15321 can. */
15322 if (n_var != n_elts)
15323 {
15324 rtx copy = copy_rtx (vals);
15325
15326 /* Load constant part of vector. We really don't care what goes into the
15327 parts we will overwrite, but we're more likely to be able to load the
15328 constant efficiently if it has fewer, larger, repeating parts
15329 (see aarch64_simd_valid_immediate). */
15330 for (int i = 0; i < n_elts; i++)
15331 {
15332 rtx x = XVECEXP (vals, 0, i);
15333 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15334 continue;
15335 rtx subst = any_const;
15336 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15337 {
15338 /* Look in the copied vector, as more elements are const. */
15339 rtx test = XVECEXP (copy, 0, i ^ bit);
15340 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15341 {
15342 subst = test;
15343 break;
15344 }
15345 }
15346 XVECEXP (copy, 0, i) = subst;
15347 }
15348 aarch64_expand_vector_init (target, copy);
15349 }
15350
15351 /* Insert the variable lanes directly. */
15352 for (int i = 0; i < n_elts; i++)
15353 {
15354 rtx x = XVECEXP (vals, 0, i);
15355 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15356 continue;
15357 x = copy_to_mode_reg (inner_mode, x);
15358 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15359 }
15360 }
15361
15362 /* Emit RTL corresponding to:
15363 insr TARGET, ELEM. */
15364
15365 static void
15366 emit_insr (rtx target, rtx elem)
15367 {
15368 machine_mode mode = GET_MODE (target);
15369 scalar_mode elem_mode = GET_MODE_INNER (mode);
15370 elem = force_reg (elem_mode, elem);
15371
15372 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15373 gcc_assert (icode != CODE_FOR_nothing);
15374 emit_insn (GEN_FCN (icode) (target, target, elem));
15375 }
15376
15377 /* Subroutine of aarch64_sve_expand_vector_init for handling
15378 trailing constants.
15379 This function works as follows:
15380 (a) Create a new vector consisting of trailing constants.
15381 (b) Initialize TARGET with the constant vector using emit_move_insn.
15382 (c) Insert remaining elements in TARGET using insr.
15383 NELTS is the total number of elements in the original vector, while
15384 NELTS_REQD is the number of elements that are actually
15385 significant.
15386
15387 ??? The heuristic used is to do the above only if the number of constants
15388 is at least half the total number of elements. May need fine-tuning. */
15389
15390 static bool
15391 aarch64_sve_expand_vector_init_handle_trailing_constants
15392 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15393 {
15394 machine_mode mode = GET_MODE (target);
15395 scalar_mode elem_mode = GET_MODE_INNER (mode);
15396 int n_trailing_constants = 0;
15397
15398 for (int i = nelts_reqd - 1;
15399 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15400 i--)
15401 n_trailing_constants++;
15402
15403 if (n_trailing_constants >= nelts_reqd / 2)
15404 {
15405 rtx_vector_builder v (mode, 1, nelts);
15406 for (int i = 0; i < nelts; i++)
15407 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15408 rtx const_vec = v.build ();
15409 emit_move_insn (target, const_vec);
15410
15411 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15412 emit_insr (target, builder.elt (i));
15413
15414 return true;
15415 }
15416
15417 return false;
15418 }
15419
15420 /* Subroutine of aarch64_sve_expand_vector_init.
15421 Works as follows:
15422 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15423 (b) Skip trailing elements from BUILDER, which are the same as
15424 element NELTS_REQD - 1.
15425 (c) Insert earlier elements in reverse order in TARGET using insr. */
15426
15427 static void
15428 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15429 const rtx_vector_builder &builder,
15430 int nelts_reqd)
15431 {
15432 machine_mode mode = GET_MODE (target);
15433 scalar_mode elem_mode = GET_MODE_INNER (mode);
15434
15435 struct expand_operand ops[2];
15436 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15437 gcc_assert (icode != CODE_FOR_nothing);
15438
15439 create_output_operand (&ops[0], target, mode);
15440 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15441 expand_insn (icode, 2, ops);
15442
15443 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15444 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15445 emit_insr (target, builder.elt (i));
15446 }
15447
15448 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15449 when all trailing elements of builder are same.
15450 This works as follows:
15451 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15452 (b) Insert remaining elements in TARGET using insr.
15453
15454 ??? The heuristic used is to do the above if the number of identical
15455 trailing elements is at least 3/4 of the total number of elements,
15456 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
15457
15458 static bool
15459 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15460 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15461 {
15462 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15463 if (ndups >= (3 * nelts_reqd) / 4)
15464 {
15465 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15466 nelts_reqd - ndups + 1);
15467 return true;
15468 }
15469
15470 return false;
15471 }
15472
15473 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15474 of elements in BUILDER.
15475
15476 The function tries to initialize TARGET from BUILDER if it fits one
15477 of the special cases outlined below.
15478
15479 Failing that, the function divides BUILDER into two sub-vectors:
15480 v_even = even elements of BUILDER;
15481 v_odd = odd elements of BUILDER;
15482
15483 and recursively calls itself with v_even and v_odd.
15484
15485 if (recursive call succeeded for v_even or v_odd)
15486 TARGET = zip (v_even, v_odd)
15487
15488 The function returns true if it managed to build TARGET from BUILDER
15489 with one of the special cases, false otherwise.
15490
15491 Example: {a, 1, b, 2, c, 3, d, 4}
15492
15493 The vector gets divided into:
15494 v_even = {a, b, c, d}
15495 v_odd = {1, 2, 3, 4}
15496
15497 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15498 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15499
15500 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15501 4 elements, so we construct tmp1 from v_even using insr:
15502 tmp1 = dup(d)
15503 insr tmp1, c
15504 insr tmp1, b
15505 insr tmp1, a
15506
15507 And finally:
15508 TARGET = zip (tmp1, tmp2)
15509 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15510
15511 static bool
15512 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15513 int nelts, int nelts_reqd)
15514 {
15515 machine_mode mode = GET_MODE (target);
15516
15517 /* Case 1: Vector contains trailing constants. */
15518
15519 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15520 (target, builder, nelts, nelts_reqd))
15521 return true;
15522
15523 /* Case 2: Vector contains leading constants. */
15524
15525 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15526 for (int i = 0; i < nelts_reqd; i++)
15527 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15528 rev_builder.finalize ();
15529
15530 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15531 (target, rev_builder, nelts, nelts_reqd))
15532 {
15533 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15534 return true;
15535 }
15536
15537 /* Case 3: Vector contains trailing same element. */
15538
15539 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15540 (target, builder, nelts_reqd))
15541 return true;
15542
15543 /* Case 4: Vector contains leading same element. */
15544
15545 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15546 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
15547 {
15548 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15549 return true;
15550 }
15551
15552 /* Avoid recursing below 4-elements.
15553 ??? The threshold 4 may need fine-tuning. */
15554
15555 if (nelts_reqd <= 4)
15556 return false;
15557
15558 rtx_vector_builder v_even (mode, 1, nelts);
15559 rtx_vector_builder v_odd (mode, 1, nelts);
15560
15561 for (int i = 0; i < nelts * 2; i += 2)
15562 {
15563 v_even.quick_push (builder.elt (i));
15564 v_odd.quick_push (builder.elt (i + 1));
15565 }
15566
15567 v_even.finalize ();
15568 v_odd.finalize ();
15569
15570 rtx tmp1 = gen_reg_rtx (mode);
15571 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
15572 nelts, nelts_reqd / 2);
15573
15574 rtx tmp2 = gen_reg_rtx (mode);
15575 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
15576 nelts, nelts_reqd / 2);
15577
15578 if (!did_even_p && !did_odd_p)
15579 return false;
15580
15581 /* Initialize whichever of v_even and v_odd did not match any of the
15582 special cases using INSR, then zip v_even and v_odd. */
15583
15584 if (!did_even_p)
15585 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
15586
15587 if (!did_odd_p)
15588 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
15589
15590 rtvec v = gen_rtvec (2, tmp1, tmp2);
15591 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
15592 return true;
15593 }
15594
15595 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
15596
15597 void
15598 aarch64_sve_expand_vector_init (rtx target, rtx vals)
15599 {
15600 machine_mode mode = GET_MODE (target);
15601 int nelts = XVECLEN (vals, 0);
15602
15603 rtx_vector_builder v (mode, 1, nelts);
15604 for (int i = 0; i < nelts; i++)
15605 v.quick_push (XVECEXP (vals, 0, i));
15606 v.finalize ();
15607
15608 /* If neither sub-vector of v could be initialized specially,
15609 then use INSR to insert all elements from v into TARGET.
15610 ??? This might not be optimal for vectors with large
15611 initializers like 16-element or above.
15612 For nelts < 4, it probably isn't useful to handle specially. */
15613
15614 if (nelts < 4
15615 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
15616 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
15617 }
15618
15619 static unsigned HOST_WIDE_INT
15620 aarch64_shift_truncation_mask (machine_mode mode)
15621 {
15622 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15623 return 0;
15624 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15625 }
15626
15627 /* Select a format to encode pointers in exception handling data. */
15628 int
15629 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15630 {
15631 int type;
15632 switch (aarch64_cmodel)
15633 {
15634 case AARCH64_CMODEL_TINY:
15635 case AARCH64_CMODEL_TINY_PIC:
15636 case AARCH64_CMODEL_SMALL:
15637 case AARCH64_CMODEL_SMALL_PIC:
15638 case AARCH64_CMODEL_SMALL_SPIC:
15639 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15640 for everything. */
15641 type = DW_EH_PE_sdata4;
15642 break;
15643 default:
15644 /* No assumptions here. 8-byte relocs required. */
15645 type = DW_EH_PE_sdata8;
15646 break;
15647 }
15648 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15649 }
15650
15651 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15652
15653 static void
15654 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15655 {
15656 if (aarch64_simd_decl_p (decl))
15657 {
15658 fprintf (stream, "\t.variant_pcs\t");
15659 assemble_name (stream, name);
15660 fprintf (stream, "\n");
15661 }
15662 }
15663
15664 /* The last .arch and .tune assembly strings that we printed. */
15665 static std::string aarch64_last_printed_arch_string;
15666 static std::string aarch64_last_printed_tune_string;
15667
15668 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15669 by the function fndecl. */
15670
15671 void
15672 aarch64_declare_function_name (FILE *stream, const char* name,
15673 tree fndecl)
15674 {
15675 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15676
15677 struct cl_target_option *targ_options;
15678 if (target_parts)
15679 targ_options = TREE_TARGET_OPTION (target_parts);
15680 else
15681 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15682 gcc_assert (targ_options);
15683
15684 const struct processor *this_arch
15685 = aarch64_get_arch (targ_options->x_explicit_arch);
15686
15687 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
15688 std::string extension
15689 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15690 this_arch->flags);
15691 /* Only update the assembler .arch string if it is distinct from the last
15692 such string we printed. */
15693 std::string to_print = this_arch->name + extension;
15694 if (to_print != aarch64_last_printed_arch_string)
15695 {
15696 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15697 aarch64_last_printed_arch_string = to_print;
15698 }
15699
15700 /* Print the cpu name we're tuning for in the comments; this might be
15701 useful to readers of the generated asm. Do it only when it changes
15702 from function to function and verbose assembly is requested. */
15703 const struct processor *this_tune
15704 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15705
15706 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15707 {
15708 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15709 this_tune->name);
15710 aarch64_last_printed_tune_string = this_tune->name;
15711 }
15712
15713 aarch64_asm_output_variant_pcs (stream, fndecl, name);
15714
15715 /* Don't forget the type directive for ELF. */
15716 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15717 ASM_OUTPUT_LABEL (stream, name);
15718 }
15719
15720 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15721
15722 void
15723 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15724 {
15725 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15726 const char *value = IDENTIFIER_POINTER (target);
15727 aarch64_asm_output_variant_pcs (stream, decl, name);
15728 ASM_OUTPUT_DEF (stream, name, value);
15729 }
15730
15731 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15732 function symbol references. */
15733
15734 void
15735 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15736 {
15737 default_elf_asm_output_external (stream, decl, name);
15738 aarch64_asm_output_variant_pcs (stream, decl, name);
15739 }
15740
15741 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
15742 Used to output the .cfi_b_key_frame directive when signing the current
15743 function with the B key. */
15744
15745 void
15746 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
15747 {
15748 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
15749 && aarch64_ra_sign_key == AARCH64_KEY_B)
15750 asm_fprintf (f, "\t.cfi_b_key_frame\n");
15751 }
15752
15753 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15754
15755 static void
15756 aarch64_start_file (void)
15757 {
15758 struct cl_target_option *default_options
15759 = TREE_TARGET_OPTION (target_option_default_node);
15760
15761 const struct processor *default_arch
15762 = aarch64_get_arch (default_options->x_explicit_arch);
15763 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
15764 std::string extension
15765 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15766 default_arch->flags);
15767
15768 aarch64_last_printed_arch_string = default_arch->name + extension;
15769 aarch64_last_printed_tune_string = "";
15770 asm_fprintf (asm_out_file, "\t.arch %s\n",
15771 aarch64_last_printed_arch_string.c_str ());
15772
15773 default_file_start ();
15774 }
15775
15776 /* Emit load exclusive. */
15777
15778 static void
15779 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15780 rtx mem, rtx model_rtx)
15781 {
15782 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15783 }
15784
15785 /* Emit store exclusive. */
15786
15787 static void
15788 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15789 rtx rval, rtx mem, rtx model_rtx)
15790 {
15791 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15792 }
15793
15794 /* Emit INSN as a jump and mark it as very unlikely to be taken. */
15795
15796 static void
15797 aarch64_emit_unlikely_jump (rtx insn)
15798 {
15799 rtx_insn *jump = emit_jump_insn (insn);
15800 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15801 }
15802
15803 /* Expand a compare and swap pattern. */
15804
15805 void
15806 aarch64_expand_compare_and_swap (rtx operands[])
15807 {
15808 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15809 machine_mode mode, r_mode;
15810
15811 bval = operands[0];
15812 rval = operands[1];
15813 mem = operands[2];
15814 oldval = operands[3];
15815 newval = operands[4];
15816 is_weak = operands[5];
15817 mod_s = operands[6];
15818 mod_f = operands[7];
15819 mode = GET_MODE (mem);
15820
15821 /* Normally the succ memory model must be stronger than fail, but in the
15822 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15823 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15824 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15825 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15826 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15827
15828 r_mode = mode;
15829 if (mode == QImode || mode == HImode)
15830 {
15831 r_mode = SImode;
15832 rval = gen_reg_rtx (r_mode);
15833 }
15834
15835 if (TARGET_LSE)
15836 {
15837 /* The CAS insn requires oldval and rval overlap, but we need to
15838 have a copy of oldval saved across the operation to tell if
15839 the operation is successful. */
15840 if (reg_overlap_mentioned_p (rval, oldval))
15841 rval = copy_to_mode_reg (r_mode, oldval);
15842 else
15843 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15844
15845 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15846 newval, mod_s));
15847 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15848 }
15849 else
15850 {
15851 /* The oldval predicate varies by mode. Test it and force to reg. */
15852 insn_code code = code_for_aarch64_compare_and_swap (mode);
15853 if (!insn_data[code].operand[2].predicate (oldval, mode))
15854 oldval = force_reg (mode, oldval);
15855
15856 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15857 is_weak, mod_s, mod_f));
15858 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15859 }
15860
15861 if (r_mode != mode)
15862 rval = gen_lowpart (mode, rval);
15863 emit_move_insn (operands[1], rval);
15864
15865 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15866 emit_insn (gen_rtx_SET (bval, x));
15867 }
15868
15869 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
15870 sequence implementing an atomic operation. */
15871
15872 static void
15873 aarch64_emit_post_barrier (enum memmodel model)
15874 {
15875 const enum memmodel base_model = memmodel_base (model);
15876
15877 if (is_mm_sync (model)
15878 && (base_model == MEMMODEL_ACQUIRE
15879 || base_model == MEMMODEL_ACQ_REL
15880 || base_model == MEMMODEL_SEQ_CST))
15881 {
15882 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15883 }
15884 }
15885
15886 /* Split a compare and swap pattern. */
15887
15888 void
15889 aarch64_split_compare_and_swap (rtx operands[])
15890 {
15891 rtx rval, mem, oldval, newval, scratch;
15892 machine_mode mode;
15893 bool is_weak;
15894 rtx_code_label *label1, *label2;
15895 rtx x, cond;
15896 enum memmodel model;
15897 rtx model_rtx;
15898
15899 rval = operands[0];
15900 mem = operands[1];
15901 oldval = operands[2];
15902 newval = operands[3];
15903 is_weak = (operands[4] != const0_rtx);
15904 model_rtx = operands[5];
15905 scratch = operands[7];
15906 mode = GET_MODE (mem);
15907 model = memmodel_from_int (INTVAL (model_rtx));
15908
15909 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15910 loop:
15911 .label1:
15912 LD[A]XR rval, [mem]
15913 CBNZ rval, .label2
15914 ST[L]XR scratch, newval, [mem]
15915 CBNZ scratch, .label1
15916 .label2:
15917 CMP rval, 0. */
15918 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15919
15920 label1 = NULL;
15921 if (!is_weak)
15922 {
15923 label1 = gen_label_rtx ();
15924 emit_label (label1);
15925 }
15926 label2 = gen_label_rtx ();
15927
15928 /* The initial load can be relaxed for a __sync operation since a final
15929 barrier will be emitted to stop code hoisting. */
15930 if (is_mm_sync (model))
15931 aarch64_emit_load_exclusive (mode, rval, mem,
15932 GEN_INT (MEMMODEL_RELAXED));
15933 else
15934 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15935
15936 if (strong_zero_p)
15937 {
15938 if (aarch64_track_speculation)
15939 {
15940 /* Emit an explicit compare instruction, so that we can correctly
15941 track the condition codes. */
15942 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15943 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15944 }
15945 else
15946 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15947
15948 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15949 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15950 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15951 }
15952 else
15953 {
15954 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15955 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15956 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15957 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15958 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15959 }
15960
15961 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15962
15963 if (!is_weak)
15964 {
15965 if (aarch64_track_speculation)
15966 {
15967 /* Emit an explicit compare instruction, so that we can correctly
15968 track the condition codes. */
15969 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15970 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15971 }
15972 else
15973 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15974
15975 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15976 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15977 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15978 }
15979 else
15980 {
15981 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15982 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15983 emit_insn (gen_rtx_SET (cond, x));
15984 }
15985
15986 emit_label (label2);
15987 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15988 to set the condition flags. If this is not used, it will be removed by
15989 later passes. */
15990 if (strong_zero_p)
15991 {
15992 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15993 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15994 emit_insn (gen_rtx_SET (cond, x));
15995 }
15996 /* Emit any final barrier needed for a __sync operation. */
15997 if (is_mm_sync (model))
15998 aarch64_emit_post_barrier (model);
15999 }
16000
16001 /* Split an atomic operation. */
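/* The split below forms the usual load-exclusive/store-exclusive retry loop,
roughly:
.label:
LD[A]XR old_out, [mem]
<op> new_out, old_out, value
ST[L]XR cond, new_out, [mem]
CBNZ cond, .label
followed by any final barrier required for a __sync operation. */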
16002
16003 void
16004 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16005 rtx value, rtx model_rtx, rtx cond)
16006 {
16007 machine_mode mode = GET_MODE (mem);
16008 machine_mode wmode = (mode == DImode ? DImode : SImode);
16009 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16010 const bool is_sync = is_mm_sync (model);
16011 rtx_code_label *label;
16012 rtx x;
16013
16014 /* Split the atomic operation into a sequence. */
16015 label = gen_label_rtx ();
16016 emit_label (label);
16017
16018 if (new_out)
16019 new_out = gen_lowpart (wmode, new_out);
16020 if (old_out)
16021 old_out = gen_lowpart (wmode, old_out);
16022 else
16023 old_out = new_out;
16024 value = simplify_gen_subreg (wmode, value, mode, 0);
16025
16026 /* The initial load can be relaxed for a __sync operation since a final
16027 barrier will be emitted to stop code hoisting. */
16028 if (is_sync)
16029 aarch64_emit_load_exclusive (mode, old_out, mem,
16030 GEN_INT (MEMMODEL_RELAXED));
16031 else
16032 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16033
16034 switch (code)
16035 {
16036 case SET:
16037 new_out = value;
16038 break;
16039
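/* The NOT code implements GCC's atomic NAND: new_out = ~(old_out & value). */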
16040 case NOT:
16041 x = gen_rtx_AND (wmode, old_out, value);
16042 emit_insn (gen_rtx_SET (new_out, x));
16043 x = gen_rtx_NOT (wmode, new_out);
16044 emit_insn (gen_rtx_SET (new_out, x));
16045 break;
16046
16047 case MINUS:
16048 if (CONST_INT_P (value))
16049 {
16050 value = GEN_INT (-INTVAL (value));
16051 code = PLUS;
16052 }
16053 /* Fall through. */
16054
16055 default:
16056 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16057 emit_insn (gen_rtx_SET (new_out, x));
16058 break;
16059 }
16060
16061 aarch64_emit_store_exclusive (mode, cond, mem,
16062 gen_lowpart (mode, new_out), model_rtx);
16063
16064 if (aarch64_track_speculation)
16065 {
16066 /* Emit an explicit compare instruction, so that we can correctly
16067 track the condition codes. */
16068 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16069 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16070 }
16071 else
16072 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16073
16074 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16075 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16076 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16077
16078 /* Emit any final barrier needed for a __sync operation. */
16079 if (is_sync)
16080 aarch64_emit_post_barrier (model);
16081 }
16082
16083 static void
16084 aarch64_init_libfuncs (void)
16085 {
16086 /* Half-precision float operations. The compiler handles all operations
16087 with NULL libfuncs by converting to SFmode. */
16088
16089 /* Conversions. */
16090 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16091 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16092
16093 /* Arithmetic. */
16094 set_optab_libfunc (add_optab, HFmode, NULL);
16095 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16096 set_optab_libfunc (smul_optab, HFmode, NULL);
16097 set_optab_libfunc (neg_optab, HFmode, NULL);
16098 set_optab_libfunc (sub_optab, HFmode, NULL);
16099
16100 /* Comparisons. */
16101 set_optab_libfunc (eq_optab, HFmode, NULL);
16102 set_optab_libfunc (ne_optab, HFmode, NULL);
16103 set_optab_libfunc (lt_optab, HFmode, NULL);
16104 set_optab_libfunc (le_optab, HFmode, NULL);
16105 set_optab_libfunc (ge_optab, HFmode, NULL);
16106 set_optab_libfunc (gt_optab, HFmode, NULL);
16107 set_optab_libfunc (unord_optab, HFmode, NULL);
16108 }
16109
16110 /* Target hook for c_mode_for_suffix. */
16111 static machine_mode
16112 aarch64_c_mode_for_suffix (char suffix)
16113 {
16114 if (suffix == 'q')
16115 return TFmode;
16116
16117 return VOIDmode;
16118 }
16119
16120 /* We can only represent floating point constants which will fit in
16121 "quarter-precision" values. These values are characterised by
16122 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16123 by:
16124
16125 (-1)^s * (n/16) * 2^r
16126
16127 Where:
16128 's' is the sign bit.
16129 'n' is an integer in the range 16 <= n <= 31.
16130 'r' is an integer in the range -3 <= r <= 4. */
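/* For example, under this scheme 1.0 is (+1) * (16/16) * 2^0 and 1.25 is
(+1) * (20/16) * 2^0; the largest representable magnitude is
31.0 = (31/16) * 2^4 and the smallest is 0.125 = (16/16) * 2^-3. */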
16131
16132 /* Return true iff X can be represented as a quarter-precision
16133 floating-point immediate operand. Note, we cannot represent 0.0. */
16134 bool
16135 aarch64_float_const_representable_p (rtx x)
16136 {
16137 /* This represents our current view of how many bits
16138 make up the mantissa. */
16139 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16140 int exponent;
16141 unsigned HOST_WIDE_INT mantissa, mask;
16142 REAL_VALUE_TYPE r, m;
16143 bool fail;
16144
16145 if (!CONST_DOUBLE_P (x))
16146 return false;
16147
16148 if (GET_MODE (x) == VOIDmode
16149 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16150 return false;
16151
16152 r = *CONST_DOUBLE_REAL_VALUE (x);
16153
16154 /* We cannot represent infinities, NaNs or +/-zero. We won't
16155 know if we have +zero until we analyse the mantissa, but we
16156 can reject the other invalid values. */
16157 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16158 || REAL_VALUE_MINUS_ZERO (r))
16159 return false;
16160
16161 /* Extract exponent. */
16162 r = real_value_abs (&r);
16163 exponent = REAL_EXP (&r);
16164
16165 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16166 highest (sign) bit, with a fixed binary point at bit point_pos.
16167 The low half of W holds the low part of the mantissa, the high half the high part.
16168 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16169 bits for the mantissa, this can fail (low bits will be lost). */
16170 real_ldexp (&m, &r, point_pos - exponent);
16171 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16172
16173 /* If the low part of the mantissa has bits set we cannot represent
16174 the value. */
16175 if (w.ulow () != 0)
16176 return false;
16177 /* We have rejected the lower HOST_WIDE_INT, so update our
16178 understanding of how many bits lie in the mantissa and
16179 look only at the high HOST_WIDE_INT. */
16180 mantissa = w.elt (1);
16181 point_pos -= HOST_BITS_PER_WIDE_INT;
16182
16183 /* We can only represent values with a mantissa of the form 1.xxxx. */
16184 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16185 if ((mantissa & mask) != 0)
16186 return false;
16187
16188 /* Having filtered unrepresentable values, we may now remove all
16189 but the highest 5 bits. */
16190 mantissa >>= point_pos - 5;
16191
16192 /* We cannot represent the value 0.0, so reject it. This is handled
16193 elsewhere. */
16194 if (mantissa == 0)
16195 return false;
16196
16197 /* Then, as bit 4 is always set, we can mask it off, leaving
16198 the mantissa in the range [0, 15]. */
16199 mantissa &= ~(1 << 4);
16200 gcc_assert (mantissa <= 15);
16201
16202 /* GCC internally does not use IEEE754-like encoding (where normalized
16203 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
16204 Our mantissa values are shifted 4 places to the left relative to
16205 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16206 by 5 places to correct for GCC's representation. */
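/* For instance, 1.0 is stored internally as 0.5 * 2^1, so REAL_EXP returns 1
and the adjusted exponent below becomes 5 - 1 = 4, inside the accepted
range [0, 7]. */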
16207 exponent = 5 - exponent;
16208
16209 return (exponent >= 0 && exponent <= 7);
16210 }
16211
16212 /* Return the asm string for an AdvSIMD MOVI, MVNI, ORR or BIC immediate
16213 move of CONST_VECTOR, whose elements fill a register of WIDTH bits. WHICH
16214 selects whether to output MOVI/MVNI, ORR or BIC immediate. */
16215 char*
16216 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16217 enum simd_immediate_check which)
16218 {
16219 bool is_valid;
16220 static char templ[40];
16221 const char *mnemonic;
16222 const char *shift_op;
16223 unsigned int lane_count = 0;
16224 char element_char;
16225
16226 struct simd_immediate_info info;
16227
16228 /* This will return true to show const_vector is legal for use as either
16229 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16230 It will also update INFO to show how the immediate should be generated.
16231 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16232 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16233 gcc_assert (is_valid);
16234
16235 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16236 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16237
16238 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16239 {
16240 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
16241 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16242 move immediate path. */
16243 if (aarch64_float_const_zero_rtx_p (info.value))
16244 info.value = GEN_INT (0);
16245 else
16246 {
16247 const unsigned int buf_size = 20;
16248 char float_buf[buf_size] = {'\0'};
16249 real_to_decimal_for_mode (float_buf,
16250 CONST_DOUBLE_REAL_VALUE (info.value),
16251 buf_size, buf_size, 1, info.elt_mode);
16252
16253 if (lane_count == 1)
16254 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16255 else
16256 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16257 lane_count, element_char, float_buf);
16258 return templ;
16259 }
16260 }
16261
16262 gcc_assert (CONST_INT_P (info.value));
16263
16264 if (which == AARCH64_CHECK_MOV)
16265 {
16266 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16267 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
16268 if (lane_count == 1)
16269 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16270 mnemonic, UINTVAL (info.value));
16271 else if (info.shift)
16272 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16273 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16274 element_char, UINTVAL (info.value), shift_op, info.shift);
16275 else
16276 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16277 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16278 element_char, UINTVAL (info.value));
16279 }
16280 else
16281 {
16282 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16283 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16284 if (info.shift)
16285 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16286 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16287 element_char, UINTVAL (info.value), "lsl", info.shift);
16288 else
16289 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16290 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16291 element_char, UINTVAL (info.value));
16292 }
16293 return templ;
16294 }
16295
16296 char*
16297 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16298 {
16299
16300 /* If a floating point number was passed and we desire to use it in an
16301 integer mode do the conversion to integer. */
16302 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16303 {
16304 unsigned HOST_WIDE_INT ival;
16305 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16306 gcc_unreachable ();
16307 immediate = gen_int_mode (ival, mode);
16308 }
16309
16310 machine_mode vmode;
16311 /* Use a 64-bit vector mode for everything except for DI/DF mode, where we
16312 use a 128-bit vector mode. */
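/* For example, an SImode immediate is duplicated into a V2SI vector
(64 bits), whereas a DImode immediate needs a V2DI vector (128 bits) so
that the container still holds more than one lane. */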
16313 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16314
16315 vmode = aarch64_simd_container_mode (mode, width);
16316 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16317 return aarch64_output_simd_mov_immediate (v_op, width);
16318 }
16319
16320 /* Return the output string to use for moving immediate CONST_VECTOR
16321 into an SVE register. */
16322
16323 char *
16324 aarch64_output_sve_mov_immediate (rtx const_vector)
16325 {
16326 static char templ[40];
16327 struct simd_immediate_info info;
16328 char element_char;
16329
16330 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16331 gcc_assert (is_valid);
16332
16333 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16334
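/* A constant series such as { 1, 3, 5, 7, ... } can be loaded with a single
SVE INDEX instruction, e.g. "index z0.s, #1, #2". */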
16335 if (info.step)
16336 {
16337 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16338 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16339 element_char, INTVAL (info.value), INTVAL (info.step));
16340 return templ;
16341 }
16342
16343 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16344 {
16345 if (aarch64_float_const_zero_rtx_p (info.value))
16346 info.value = GEN_INT (0);
16347 else
16348 {
16349 const int buf_size = 20;
16350 char float_buf[buf_size] = {};
16351 real_to_decimal_for_mode (float_buf,
16352 CONST_DOUBLE_REAL_VALUE (info.value),
16353 buf_size, buf_size, 1, info.elt_mode);
16354
16355 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16356 element_char, float_buf);
16357 return templ;
16358 }
16359 }
16360
16361 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16362 element_char, INTVAL (info.value));
16363 return templ;
16364 }
16365
16366 /* Return the asm format for a PTRUE instruction whose destination has
16367 mode MODE. SUFFIX is the element size suffix. */
16368
16369 char *
16370 aarch64_output_ptrue (machine_mode mode, char suffix)
16371 {
16372 unsigned int nunits;
16373 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16374 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16375 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16376 else
16377 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16378 return buf;
16379 }
16380
16381 /* Split operands into moves from op[1] + op[2] into op[0]. */
16382
16383 void
16384 aarch64_split_combinev16qi (rtx operands[3])
16385 {
16386 unsigned int dest = REGNO (operands[0]);
16387 unsigned int src1 = REGNO (operands[1]);
16388 unsigned int src2 = REGNO (operands[2]);
16389 machine_mode halfmode = GET_MODE (operands[1]);
16390 unsigned int halfregs = REG_NREGS (operands[1]);
16391 rtx destlo, desthi;
16392
16393 gcc_assert (halfmode == V16QImode);
16394
16395 if (src1 == dest && src2 == dest + halfregs)
16396 {
16397 /* No-op move. Can't split to nothing; emit something. */
16398 emit_note (NOTE_INSN_DELETED);
16399 return;
16400 }
16401
16402 /* Preserve register attributes for variable tracking. */
16403 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16404 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16405 GET_MODE_SIZE (halfmode));
16406
16407 /* Special case of reversed high/low parts. */
16408 if (reg_overlap_mentioned_p (operands[2], destlo)
16409 && reg_overlap_mentioned_p (operands[1], desthi))
16410 {
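/* Swap the two source registers using the three-XOR trick, which avoids
the need for a scratch register. */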
16411 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16412 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16413 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16414 }
16415 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16416 {
16417 /* Try to avoid unnecessary moves if part of the result
16418 is in the right place already. */
16419 if (src1 != dest)
16420 emit_move_insn (destlo, operands[1]);
16421 if (src2 != dest + halfregs)
16422 emit_move_insn (desthi, operands[2]);
16423 }
16424 else
16425 {
16426 if (src2 != dest + halfregs)
16427 emit_move_insn (desthi, operands[2]);
16428 if (src1 != dest)
16429 emit_move_insn (destlo, operands[1]);
16430 }
16431 }
16432
16433 /* vec_perm support. */
16434
16435 struct expand_vec_perm_d
16436 {
16437 rtx target, op0, op1;
16438 vec_perm_indices perm;
16439 machine_mode vmode;
16440 unsigned int vec_flags;
16441 bool one_vector_p;
16442 bool testing_p;
16443 };
16444
16445 /* Generate a variable permutation. */
16446
16447 static void
16448 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16449 {
16450 machine_mode vmode = GET_MODE (target);
16451 bool one_vector_p = rtx_equal_p (op0, op1);
16452
16453 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16454 gcc_checking_assert (GET_MODE (op0) == vmode);
16455 gcc_checking_assert (GET_MODE (op1) == vmode);
16456 gcc_checking_assert (GET_MODE (sel) == vmode);
16457 gcc_checking_assert (TARGET_SIMD);
16458
16459 if (one_vector_p)
16460 {
16461 if (vmode == V8QImode)
16462 {
16463 /* Expand the argument to a V16QI mode by duplicating it. */
16464 rtx pair = gen_reg_rtx (V16QImode);
16465 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16466 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16467 }
16468 else
16469 {
16470 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16471 }
16472 }
16473 else
16474 {
16475 rtx pair;
16476
16477 if (vmode == V8QImode)
16478 {
16479 pair = gen_reg_rtx (V16QImode);
16480 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16481 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16482 }
16483 else
16484 {
16485 pair = gen_reg_rtx (OImode);
16486 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16487 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16488 }
16489 }
16490 }
16491
16492 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16493 NELT is the number of elements in the vector. */
16494
16495 void
16496 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16497 unsigned int nelt)
16498 {
16499 machine_mode vmode = GET_MODE (target);
16500 bool one_vector_p = rtx_equal_p (op0, op1);
16501 rtx mask;
16502
16503 /* The TBL instruction does not use a modulo index, so we must take care
16504 of that ourselves. */
16505 mask = aarch64_simd_gen_const_vector_dup (vmode,
16506 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16507 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16508
16509 /* For big-endian, we also need to reverse the index within the vector
16510 (but not which vector). */
16511 if (BYTES_BIG_ENDIAN)
16512 {
16513 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16514 if (!one_vector_p)
16515 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16516 sel = expand_simple_binop (vmode, XOR, sel, mask,
16517 NULL, 0, OPTAB_LIB_WIDEN);
16518 }
16519 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16520 }
16521
16522 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16523
16524 static void
16525 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16526 {
16527 emit_insn (gen_rtx_SET (target,
16528 gen_rtx_UNSPEC (GET_MODE (target),
16529 gen_rtvec (2, op0, op1), code)));
16530 }
16531
16532 /* Expand an SVE vec_perm with the given operands. */
16533
16534 void
16535 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16536 {
16537 machine_mode data_mode = GET_MODE (target);
16538 machine_mode sel_mode = GET_MODE (sel);
16539 /* Enforced by the pattern condition. */
16540 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16541
16542 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16543 size of the two value vectors, i.e. the upper bits of the indices
16544 are effectively ignored. SVE TBL instead produces 0 for any
16545 out-of-range indices, so we need to modulo all the vec_perm indices
16546 to ensure they are all in range. */
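/* The general two-vector case below therefore becomes, in outline:
res0 = TBL (op0, sel & (2 * nunits - 1));
res1 = TBL (op1, sel - nunits);
target = res0 | res1;
where every out-of-range index selects zero, so the OR (or its
floating-point equivalent) merges the two partial results. */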
16547 rtx sel_reg = force_reg (sel_mode, sel);
16548
16549 /* Check if the sel only references the first values vector. */
16550 if (GET_CODE (sel) == CONST_VECTOR
16551 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16552 {
16553 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16554 return;
16555 }
16556
16557 /* Check if the two values vectors are the same. */
16558 if (rtx_equal_p (op0, op1))
16559 {
16560 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16561 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16562 NULL, 0, OPTAB_DIRECT);
16563 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16564 return;
16565 }
16566
16567 /* Run TBL on each value vector and combine the results. */
16568
16569 rtx res0 = gen_reg_rtx (data_mode);
16570 rtx res1 = gen_reg_rtx (data_mode);
16571 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16572 if (GET_CODE (sel) != CONST_VECTOR
16573 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16574 {
16575 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16576 2 * nunits - 1);
16577 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16578 NULL, 0, OPTAB_DIRECT);
16579 }
16580 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16581 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16582 NULL, 0, OPTAB_DIRECT);
16583 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16584 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16585 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16586 else
16587 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16588 }
16589
16590 /* Recognize patterns suitable for the TRN instructions. */
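/* For example, on V4SImode the permutation { 0, 4, 2, 6 } selects the
even lanes of both inputs and maps to TRN1, while { 1, 5, 3, 7 } maps
to TRN2. */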
16591 static bool
16592 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16593 {
16594 HOST_WIDE_INT odd;
16595 poly_uint64 nelt = d->perm.length ();
16596 rtx out, in0, in1, x;
16597 machine_mode vmode = d->vmode;
16598
16599 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16600 return false;
16601
16602 /* Note that these are little-endian tests.
16603 We correct for big-endian later. */
16604 if (!d->perm[0].is_constant (&odd)
16605 || (odd != 0 && odd != 1)
16606 || !d->perm.series_p (0, 2, odd, 2)
16607 || !d->perm.series_p (1, 2, nelt + odd, 2))
16608 return false;
16609
16610 /* Success! */
16611 if (d->testing_p)
16612 return true;
16613
16614 in0 = d->op0;
16615 in1 = d->op1;
16616 /* We don't need a big-endian lane correction for SVE; see the comment
16617 at the head of aarch64-sve.md for details. */
16618 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16619 {
16620 x = in0, in0 = in1, in1 = x;
16621 odd = !odd;
16622 }
16623 out = d->target;
16624
16625 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16626 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16627 return true;
16628 }
16629
16630 /* Recognize patterns suitable for the UZP instructions. */
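/* For example, on V4SImode { 0, 2, 4, 6 } maps to UZP1 and { 1, 3, 5, 7 }
maps to UZP2. */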
16631 static bool
16632 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16633 {
16634 HOST_WIDE_INT odd;
16635 rtx out, in0, in1, x;
16636 machine_mode vmode = d->vmode;
16637
16638 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16639 return false;
16640
16641 /* Note that these are little-endian tests.
16642 We correct for big-endian later. */
16643 if (!d->perm[0].is_constant (&odd)
16644 || (odd != 0 && odd != 1)
16645 || !d->perm.series_p (0, 1, odd, 2))
16646 return false;
16647
16648 /* Success! */
16649 if (d->testing_p)
16650 return true;
16651
16652 in0 = d->op0;
16653 in1 = d->op1;
16654 /* We don't need a big-endian lane correction for SVE; see the comment
16655 at the head of aarch64-sve.md for details. */
16656 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16657 {
16658 x = in0, in0 = in1, in1 = x;
16659 odd = !odd;
16660 }
16661 out = d->target;
16662
16663 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16664 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16665 return true;
16666 }
16667
16668 /* Recognize patterns suitable for the ZIP instructions. */
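/* For example, on V4SImode { 0, 4, 1, 5 } interleaves the low halves of the
two inputs and maps to ZIP1, while { 2, 6, 3, 7 } interleaves the high
halves and maps to ZIP2. */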
16669 static bool
16670 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16671 {
16672 unsigned int high;
16673 poly_uint64 nelt = d->perm.length ();
16674 rtx out, in0, in1, x;
16675 machine_mode vmode = d->vmode;
16676
16677 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16678 return false;
16679
16680 /* Note that these are little-endian tests.
16681 We correct for big-endian later. */
16682 poly_uint64 first = d->perm[0];
16683 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16684 || !d->perm.series_p (0, 2, first, 1)
16685 || !d->perm.series_p (1, 2, first + nelt, 1))
16686 return false;
16687 high = maybe_ne (first, 0U);
16688
16689 /* Success! */
16690 if (d->testing_p)
16691 return true;
16692
16693 in0 = d->op0;
16694 in1 = d->op1;
16695 /* We don't need a big-endian lane correction for SVE; see the comment
16696 at the head of aarch64-sve.md for details. */
16697 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16698 {
16699 x = in0, in0 = in1, in1 = x;
16700 high = !high;
16701 }
16702 out = d->target;
16703
16704 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16705 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16706 return true;
16707 }
16708
16709 /* Recognize patterns for the EXT insn. */
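/* For example, on V4SImode { 1, 2, 3, 4 } takes the last three elements of
the first vector followed by the first element of the second, and maps to
an EXT with an element offset of 1. */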
16710
16711 static bool
16712 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16713 {
16714 HOST_WIDE_INT location;
16715 rtx offset;
16716
16717 /* The first element always refers to the first vector.
16718 Check if the extracted indices are increasing by one. */
16719 if (d->vec_flags == VEC_SVE_PRED
16720 || !d->perm[0].is_constant (&location)
16721 || !d->perm.series_p (0, 1, location, 1))
16722 return false;
16723
16724 /* Success! */
16725 if (d->testing_p)
16726 return true;
16727
16728 /* The case where (location == 0) is a no-op for both big- and little-endian,
16729 and is removed by the mid-end at optimization levels -O1 and higher.
16730
16731 We don't need a big-endian lane correction for SVE; see the comment
16732 at the head of aarch64-sve.md for details. */
16733 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16734 {
16735 /* After setup, we want the high elements of the first vector (stored
16736 at the LSB end of the register), and the low elements of the second
16737 vector (stored at the MSB end of the register). So swap. */
16738 std::swap (d->op0, d->op1);
16739 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16740 to_constant () is safe since this is restricted to Advanced SIMD
16741 vectors. */
16742 location = d->perm.length ().to_constant () - location;
16743 }
16744
16745 offset = GEN_INT (location);
16746 emit_set_insn (d->target,
16747 gen_rtx_UNSPEC (d->vmode,
16748 gen_rtvec (3, d->op0, d->op1, offset),
16749 UNSPEC_EXT));
16750 return true;
16751 }
16752
16753 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16754 within each 64-bit, 32-bit or 16-bit granule. */
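/* For example, on V8HImode { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the four
16-bit elements within each 64-bit granule and so maps to REV64. */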
16755
16756 static bool
16757 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16758 {
16759 HOST_WIDE_INT diff;
16760 unsigned int i, size, unspec;
16761 machine_mode pred_mode;
16762
16763 if (d->vec_flags == VEC_SVE_PRED
16764 || !d->one_vector_p
16765 || !d->perm[0].is_constant (&diff))
16766 return false;
16767
16768 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16769 if (size == 8)
16770 {
16771 unspec = UNSPEC_REV64;
16772 pred_mode = VNx2BImode;
16773 }
16774 else if (size == 4)
16775 {
16776 unspec = UNSPEC_REV32;
16777 pred_mode = VNx4BImode;
16778 }
16779 else if (size == 2)
16780 {
16781 unspec = UNSPEC_REV16;
16782 pred_mode = VNx8BImode;
16783 }
16784 else
16785 return false;
16786
16787 unsigned int step = diff + 1;
16788 for (i = 0; i < step; ++i)
16789 if (!d->perm.series_p (i, step, diff - i, step))
16790 return false;
16791
16792 /* Success! */
16793 if (d->testing_p)
16794 return true;
16795
16796 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16797 if (d->vec_flags == VEC_SVE_DATA)
16798 {
16799 rtx pred = aarch64_ptrue_reg (pred_mode);
16800 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16801 UNSPEC_MERGE_PTRUE);
16802 }
16803 emit_set_insn (d->target, src);
16804 return true;
16805 }
16806
16807 /* Recognize patterns for the REV insn, which reverses elements within
16808 a full vector. */
16809
16810 static bool
16811 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16812 {
16813 poly_uint64 nelt = d->perm.length ();
16814
16815 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16816 return false;
16817
16818 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16819 return false;
16820
16821 /* Success! */
16822 if (d->testing_p)
16823 return true;
16824
16825 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16826 emit_set_insn (d->target, src);
16827 return true;
16828 }
16829
16830 static bool
16831 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16832 {
16833 rtx out = d->target;
16834 rtx in0;
16835 HOST_WIDE_INT elt;
16836 machine_mode vmode = d->vmode;
16837 rtx lane;
16838
16839 if (d->vec_flags == VEC_SVE_PRED
16840 || d->perm.encoding ().encoded_nelts () != 1
16841 || !d->perm[0].is_constant (&elt))
16842 return false;
16843
16844 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16845 return false;
16846
16847 /* Success! */
16848 if (d->testing_p)
16849 return true;
16850
16851 /* The generic preparation in aarch64_expand_vec_perm_const_1
16852 swaps the operand order and the permute indices if it finds
16853 d->perm[0] to be in the second operand. Thus, we can always
16854 use d->op0 and need not do any extra arithmetic to get the
16855 correct lane number. */
16856 in0 = d->op0;
16857 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16858
16859 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16860 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16861 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16862 return true;
16863 }
16864
16865 static bool
16866 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16867 {
16868 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16869 machine_mode vmode = d->vmode;
16870
16871 /* Make sure that the indices are constant. */
16872 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16873 for (unsigned int i = 0; i < encoded_nelts; ++i)
16874 if (!d->perm[i].is_constant ())
16875 return false;
16876
16877 if (d->testing_p)
16878 return true;
16879
16880 /* Generic code will try constant permutation twice. Once with the
16881 original mode and again with the elements lowered to QImode.
16882 So wait and don't do the selector expansion ourselves. */
16883 if (vmode != V8QImode && vmode != V16QImode)
16884 return false;
16885
16886 /* to_constant is safe since this routine is specific to Advanced SIMD
16887 vectors. */
16888 unsigned int nelt = d->perm.length ().to_constant ();
16889 for (unsigned int i = 0; i < nelt; ++i)
16890 /* If big-endian and two vectors, we end up with a weird mixed-endian
16891 mode on NEON. Reverse the index within each word but not the word
16892 itself. to_constant is safe because we checked is_constant above. */
16893 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16894 ? d->perm[i].to_constant () ^ (nelt - 1)
16895 : d->perm[i].to_constant ());
16896
16897 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16898 sel = force_reg (vmode, sel);
16899
16900 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16901 return true;
16902 }
16903
16904 /* Try to implement D using an SVE TBL instruction. */
16905
16906 static bool
16907 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16908 {
16909 unsigned HOST_WIDE_INT nelt;
16910
16911 /* Permuting two variable-length vectors could overflow the
16912 index range. */
16913 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16914 return false;
16915
16916 if (d->testing_p)
16917 return true;
16918
16919 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16920 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16921 if (d->one_vector_p)
16922 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16923 else
16924 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16925 return true;
16926 }
16927
16928 static bool
16929 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16930 {
16931 /* The pattern matching functions above are written to look for a small
16932 number to begin the sequence (0, 1, N/2). If we begin with an index
16933 from the second operand, we can swap the operands. */
16934 poly_int64 nelt = d->perm.length ();
16935 if (known_ge (d->perm[0], nelt))
16936 {
16937 d->perm.rotate_inputs (1);
16938 std::swap (d->op0, d->op1);
16939 }
16940
16941 if ((d->vec_flags == VEC_ADVSIMD
16942 || d->vec_flags == VEC_SVE_DATA
16943 || d->vec_flags == VEC_SVE_PRED)
16944 && known_gt (nelt, 1))
16945 {
16946 if (aarch64_evpc_rev_local (d))
16947 return true;
16948 else if (aarch64_evpc_rev_global (d))
16949 return true;
16950 else if (aarch64_evpc_ext (d))
16951 return true;
16952 else if (aarch64_evpc_dup (d))
16953 return true;
16954 else if (aarch64_evpc_zip (d))
16955 return true;
16956 else if (aarch64_evpc_uzp (d))
16957 return true;
16958 else if (aarch64_evpc_trn (d))
16959 return true;
16960 if (d->vec_flags == VEC_SVE_DATA)
16961 return aarch64_evpc_sve_tbl (d);
16962 else if (d->vec_flags == VEC_ADVSIMD)
16963 return aarch64_evpc_tbl (d);
16964 }
16965 return false;
16966 }
16967
16968 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16969
16970 static bool
16971 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16972 rtx op1, const vec_perm_indices &sel)
16973 {
16974 struct expand_vec_perm_d d;
16975
16976 /* Check whether the mask can be applied to a single vector. */
16977 if (sel.ninputs () == 1
16978 || (op0 && rtx_equal_p (op0, op1)))
16979 d.one_vector_p = true;
16980 else if (sel.all_from_input_p (0))
16981 {
16982 d.one_vector_p = true;
16983 op1 = op0;
16984 }
16985 else if (sel.all_from_input_p (1))
16986 {
16987 d.one_vector_p = true;
16988 op0 = op1;
16989 }
16990 else
16991 d.one_vector_p = false;
16992
16993 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16994 sel.nelts_per_input ());
16995 d.vmode = vmode;
16996 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16997 d.target = target;
16998 d.op0 = op0;
16999 d.op1 = op1;
17000 d.testing_p = !target;
17001
17002 if (!d.testing_p)
17003 return aarch64_expand_vec_perm_const_1 (&d);
17004
17005 rtx_insn *last = get_last_insn ();
17006 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17007 gcc_assert (last == get_last_insn ());
17008
17009 return ret;
17010 }
17011
17012 /* Generate a byte permute mask for a register of mode MODE,
17013 which has NUNITS units. */
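/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask bytes are
{ 1, 0, 3, 2, 5, 4, ... }, i.e. the two bytes of each element are swapped. */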
17014
17015 rtx
17016 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17017 {
17018 /* We have to reverse each vector because we don't have
17019 a permuted load that can reverse-load according to ABI rules. */
17020 rtx mask;
17021 rtvec v = rtvec_alloc (16);
17022 unsigned int i, j;
17023 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17024
17025 gcc_assert (BYTES_BIG_ENDIAN);
17026 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17027
17028 for (i = 0; i < nunits; i++)
17029 for (j = 0; j < usize; j++)
17030 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17031 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17032 return force_reg (V16QImode, mask);
17033 }
17034
17035 /* Return true if X is a valid second operand for the SVE instruction
17036 that implements integer comparison OP_CODE. */
17037
17038 static bool
17039 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17040 {
17041 if (register_operand (x, VOIDmode))
17042 return true;
17043
17044 switch (op_code)
17045 {
17046 case LTU:
17047 case LEU:
17048 case GEU:
17049 case GTU:
17050 return aarch64_sve_cmp_immediate_p (x, false);
17051 case LT:
17052 case LE:
17053 case GE:
17054 case GT:
17055 case NE:
17056 case EQ:
17057 return aarch64_sve_cmp_immediate_p (x, true);
17058 default:
17059 gcc_unreachable ();
17060 }
17061 }
17062
17063 /* Use predicated SVE instructions to implement the equivalent of:
17064
17065 (set TARGET OP)
17066
17067 given that PTRUE is an all-true predicate of the appropriate mode. */
17068
17069 static void
17070 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17071 {
17072 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17073 gen_rtvec (2, ptrue, op),
17074 UNSPEC_MERGE_PTRUE);
17075 rtx_insn *insn = emit_set_insn (target, unspec);
17076 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17077 }
17078
17079 /* Likewise, but also clobber the condition codes. */
17080
17081 static void
17082 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17083 {
17084 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17085 gen_rtvec (2, ptrue, op),
17086 UNSPEC_MERGE_PTRUE);
17087 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17088 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17089 }
17090
17091 /* Return the UNSPEC_COND_* code for comparison CODE. */
17092
17093 static unsigned int
17094 aarch64_unspec_cond_code (rtx_code code)
17095 {
17096 switch (code)
17097 {
17098 case NE:
17099 return UNSPEC_COND_NE;
17100 case EQ:
17101 return UNSPEC_COND_EQ;
17102 case LT:
17103 return UNSPEC_COND_LT;
17104 case GT:
17105 return UNSPEC_COND_GT;
17106 case LE:
17107 return UNSPEC_COND_LE;
17108 case GE:
17109 return UNSPEC_COND_GE;
17110 default:
17111 gcc_unreachable ();
17112 }
17113 }
17114
17115 /* Emit:
17116
17117 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17118
17119 where <X> is the operation associated with comparison CODE. This form
17120 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17121 semantics, such as when PRED might not be all-true and when comparing
17122 inactive lanes could have side effects. */
17123
17124 static void
17125 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17126 rtx pred, rtx op0, rtx op1)
17127 {
17128 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17129 gen_rtvec (3, pred, op0, op1),
17130 aarch64_unspec_cond_code (code));
17131 emit_set_insn (target, unspec);
17132 }
17133
17134 /* Expand an SVE integer comparison using the SVE equivalent of:
17135
17136 (set TARGET (CODE OP0 OP1)). */
17137
17138 void
17139 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17140 {
17141 machine_mode pred_mode = GET_MODE (target);
17142 machine_mode data_mode = GET_MODE (op0);
17143
17144 if (!aarch64_sve_cmp_operand_p (code, op1))
17145 op1 = force_reg (data_mode, op1);
17146
17147 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17148 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17149 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17150 }
17151
17152 /* Emit the SVE equivalent of:
17153
17154 (set TMP1 (CODE1 OP0 OP1))
17155 (set TMP2 (CODE2 OP0 OP1))
17156 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17157
17158 PTRUE is an all-true predicate with the same mode as TARGET. */
17159
17160 static void
17161 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17162 rtx ptrue, rtx op0, rtx op1)
17163 {
17164 machine_mode pred_mode = GET_MODE (ptrue);
17165 rtx tmp1 = gen_reg_rtx (pred_mode);
17166 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17167 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17168 rtx tmp2 = gen_reg_rtx (pred_mode);
17169 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17170 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17171 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17172 }
17173
17174 /* Emit the SVE equivalent of:
17175
17176 (set TMP (CODE OP0 OP1))
17177 (set TARGET (not TMP))
17178
17179 PTRUE is an all-true predicate with the same mode as TARGET. */
17180
17181 static void
17182 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17183 rtx op0, rtx op1)
17184 {
17185 machine_mode pred_mode = GET_MODE (ptrue);
17186 rtx tmp = gen_reg_rtx (pred_mode);
17187 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17188 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17189 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17190 }
17191
17192 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17193
17194 (set TARGET (CODE OP0 OP1))
17195
17196 If CAN_INVERT_P is true, the caller can also handle inverted results;
17197 return true if the result is in fact inverted. */
17198
17199 bool
17200 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17201 rtx op0, rtx op1, bool can_invert_p)
17202 {
17203 machine_mode pred_mode = GET_MODE (target);
17204 machine_mode data_mode = GET_MODE (op0);
17205
17206 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17207 switch (code)
17208 {
17209 case UNORDERED:
17210 /* UNORDERED has no immediate form. */
17211 op1 = force_reg (data_mode, op1);
17212 /* fall through */
17213 case LT:
17214 case LE:
17215 case GT:
17216 case GE:
17217 case EQ:
17218 case NE:
17219 {
17220 /* There is native support for the comparison. */
17221 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17222 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17223 return false;
17224 }
17225
17226 case LTGT:
17227 /* This is a trapping operation (LT or GT). */
17228 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17229 return false;
17230
17231 case UNEQ:
17232 if (!flag_trapping_math)
17233 {
17234 /* This would trap for signaling NaNs. */
17235 op1 = force_reg (data_mode, op1);
17236 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17237 return false;
17238 }
17239 /* fall through */
17240 case UNLT:
17241 case UNLE:
17242 case UNGT:
17243 case UNGE:
17244 if (flag_trapping_math)
17245 {
17246 /* Work out which elements are ordered. */
17247 rtx ordered = gen_reg_rtx (pred_mode);
17248 op1 = force_reg (data_mode, op1);
17249 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17250
17251 /* Test the opposite condition for the ordered elements,
17252 then invert the result. */
17253 if (code == UNEQ)
17254 code = NE;
17255 else
17256 code = reverse_condition_maybe_unordered (code);
17257 if (can_invert_p)
17258 {
17259 aarch64_emit_sve_predicated_cond (target, code,
17260 ordered, op0, op1);
17261 return true;
17262 }
17263 rtx tmp = gen_reg_rtx (pred_mode);
17264 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17265 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17266 return false;
17267 }
17268 break;
17269
17270 case ORDERED:
17271 /* ORDERED has no immediate form. */
17272 op1 = force_reg (data_mode, op1);
17273 break;
17274
17275 default:
17276 gcc_unreachable ();
17277 }
17278
17279 /* There is native support for the inverse comparison. */
17280 code = reverse_condition_maybe_unordered (code);
17281 if (can_invert_p)
17282 {
17283 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17284 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17285 return true;
17286 }
17287 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17288 return false;
17289 }
17290
17291 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17292 of the data being selected and CMP_MODE is the mode of the values being
17293 compared. */
17294
17295 void
17296 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17297 rtx *ops)
17298 {
17299 machine_mode pred_mode
17300 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17301 GET_MODE_SIZE (cmp_mode)).require ();
17302 rtx pred = gen_reg_rtx (pred_mode);
17303 if (FLOAT_MODE_P (cmp_mode))
17304 {
17305 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17306 ops[4], ops[5], true))
17307 std::swap (ops[1], ops[2]);
17308 }
17309 else
17310 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17311
17312 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17313 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17314 }
17315
17316 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17317 true. However, due to issues with register allocation it is preferable
17318 to avoid tying integer scalar and FP scalar modes. Executing integer
17319 operations in general registers is better than treating them as scalar
17320 vector operations. This reduces latency and avoids redundant int<->FP
17321 moves. So tie modes if they are either the same class, or vector modes
17322 with other vector modes, vector structs or any scalar mode. */
17323
17324 static bool
17325 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17326 {
17327 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17328 return true;
17329
17330 /* We specifically want to allow elements of "structure" modes to
17331 be tieable to the structure. This more general condition allows
17332 other rarer situations too. The reason we don't extend this to
17333 predicate modes is that there are no predicate structure modes
17334 nor any specific instructions for extracting part of a predicate
17335 register. */
17336 if (aarch64_vector_data_mode_p (mode1)
17337 && aarch64_vector_data_mode_p (mode2))
17338 return true;
17339
17340 /* Also allow any scalar modes with vectors. */
17341 if (aarch64_vector_mode_supported_p (mode1)
17342 || aarch64_vector_mode_supported_p (mode2))
17343 return true;
17344
17345 return false;
17346 }
17347
17348 /* Return a new RTX holding the result of moving POINTER forward by
17349 AMOUNT bytes. */
17350
17351 static rtx
17352 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17353 {
17354 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17355
17356 return adjust_automodify_address (pointer, GET_MODE (pointer),
17357 next, amount);
17358 }
17359
17360 /* Return a new RTX holding the result of moving POINTER forward by the
17361 size of the mode it points to. */
17362
17363 static rtx
17364 aarch64_progress_pointer (rtx pointer)
17365 {
17366 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17367 }
17368
17369 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17370 MODE bytes. */
17371
17372 static void
17373 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17374 machine_mode mode)
17375 {
17376 rtx reg = gen_reg_rtx (mode);
17377
17378 /* "Cast" the pointers to the correct mode. */
17379 *src = adjust_address (*src, mode, 0);
17380 *dst = adjust_address (*dst, mode, 0);
17381 /* Emit the memcpy. */
17382 emit_move_insn (reg, *src);
17383 emit_move_insn (*dst, reg);
17384 /* Move the pointers forward. */
17385 *src = aarch64_progress_pointer (*src);
17386 *dst = aarch64_progress_pointer (*dst);
17387 }
17388
17389 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
17390 we succeed, otherwise return false. */
17391
17392 bool
17393 aarch64_expand_cpymem (rtx *operands)
17394 {
17395 int n, mode_bits;
17396 rtx dst = operands[0];
17397 rtx src = operands[1];
17398 rtx base;
17399 machine_mode cur_mode = BLKmode, next_mode;
17400 bool speed_p = !optimize_function_for_size_p (cfun);
17401
17402 /* When optimizing for size, give a better estimate of the length of a
17403 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17404 will always require an even number of instructions, and each
17405 operation requires both a load and a store, so divide the max number by 2. */
17406 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17407
17408 /* We can't do anything smart if the amount to copy is not constant. */
17409 if (!CONST_INT_P (operands[2]))
17410 return false;
17411
17412 n = INTVAL (operands[2]);
17413
17414 /* Try to keep the number of instructions low. For all cases we will do at
17415 most two moves for the residual amount, since we'll always overlap the
17416 remainder. */
17417 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17418 return false;
17419
17420 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17421 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17422
17423 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17424 src = adjust_automodify_address (src, VOIDmode, base, 0);
17425
17426 /* Convert n to bits to make the rest of the code simpler. */
17427 n = n * BITS_PER_UNIT;
17428
17429 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17430 larger than TImode, but we should not use them for loads/stores here. */
17431 const int copy_limit = GET_MODE_BITSIZE (TImode);
17432
17433 while (n > 0)
17434 {
17435 /* Find the largest mode in which to do the copy without over-reading
17436 or over-writing. */
17437 opt_scalar_int_mode mode_iter;
17438 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17439 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17440 cur_mode = mode_iter.require ();
17441
17442 gcc_assert (cur_mode != BLKmode);
17443
17444 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17445 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17446
17447 n -= mode_bits;
17448
17449 /* Do certain trailing copies as overlapping if it's going to be
17450 cheaper, i.e. fewer instructions. For instance, for a 15-byte
17451 copy it's more efficient to do two overlapping 8-byte copies than
17452 8 + 6 + 1. */
17453 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17454 {
17455 next_mode = smallest_mode_for_size (n, MODE_INT);
17456 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17457 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17458 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17459 n = n_bits;
17460 }
17461 }
17462
17463 return true;
17464 }
17465
17466 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17467 SImode stores. Handle the case when the constant has identical
17468 bottom and top halves. This is beneficial when the two stores can be
17469 merged into an STP and we avoid synthesising potentially expensive
17470 immediates twice. Return true if such a split is possible. */
17471
17472 bool
17473 aarch64_split_dimode_const_store (rtx dst, rtx src)
17474 {
17475 rtx lo = gen_lowpart (SImode, src);
17476 rtx hi = gen_highpart_mode (SImode, DImode, src);
17477
17478 bool size_p = optimize_function_for_size_p (cfun);
17479
17480 if (!rtx_equal_p (lo, hi))
17481 return false;
17482
17483 unsigned int orig_cost
17484 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17485 unsigned int lo_cost
17486 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17487
17488 /* We want to transform:
17489 MOV x1, 49370
17490 MOVK x1, 0x140, lsl 16
17491 MOVK x1, 0xc0da, lsl 32
17492 MOVK x1, 0x140, lsl 48
17493 STR x1, [x0]
17494 into:
17495 MOV w1, 49370
17496 MOVK w1, 0x140, lsl 16
17497 STP w1, w1, [x0]
17498 So we want to perform this only when we save two instructions
17499 or more. When optimizing for size, however, accept any code size
17500 savings we can. */
17501 if (size_p && orig_cost <= lo_cost)
17502 return false;
17503
17504 if (!size_p
17505 && (orig_cost <= lo_cost + 1))
17506 return false;
17507
17508 rtx mem_lo = adjust_address (dst, SImode, 0);
17509 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17510 return false;
17511
17512 rtx tmp_reg = gen_reg_rtx (SImode);
17513 aarch64_expand_mov_immediate (tmp_reg, lo);
17514 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17515 /* Don't emit an explicit store pair as this may not be always profitable.
17516 Let the sched-fusion logic decide whether to merge them. */
17517 emit_move_insn (mem_lo, tmp_reg);
17518 emit_move_insn (mem_hi, tmp_reg);
17519
17520 return true;
17521 }
17522
17523 /* Generate RTL for a conditional branch with rtx comparison CODE in
17524 mode CC_MODE. The destination of the unlikely conditional branch
17525 is LABEL_REF. */
17526
17527 void
17528 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17529 rtx label_ref)
17530 {
17531 rtx x;
17532 x = gen_rtx_fmt_ee (code, VOIDmode,
17533 gen_rtx_REG (cc_mode, CC_REGNUM),
17534 const0_rtx);
17535
17536 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17537 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17538 pc_rtx);
17539 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17540 }
17541
17542 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17543
17544 OP1 represents the TImode source operand 1
17545 OP2 represents the TImode source operand 2
17546 LOW_DEST represents the low half (DImode) of TImode operand 0
17547 LOW_IN1 represents the low half (DImode) of TImode operand 1
17548 LOW_IN2 represents the low half (DImode) of TImode operand 2
17549 HIGH_DEST represents the high half (DImode) of TImode operand 0
17550 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17551 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17552
17553 void
17554 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17555 rtx *low_in1, rtx *low_in2,
17556 rtx *high_dest, rtx *high_in1,
17557 rtx *high_in2)
17558 {
17559 *low_dest = gen_reg_rtx (DImode);
17560 *low_in1 = gen_lowpart (DImode, op1);
17561 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17562 subreg_lowpart_offset (DImode, TImode));
17563 *high_dest = gen_reg_rtx (DImode);
17564 *high_in1 = gen_highpart (DImode, op1);
17565 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17566 subreg_highpart_offset (DImode, TImode));
17567 }
17568
17569 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17570
17571 This function differs from 'aarch64_addti_scratch_regs' in that
17572 OP1 can be an immediate constant (zero). We must call
17573 subreg_highpart_offset with DImode and TImode arguments, otherwise
17574 VOIDmode will be used for the const_int, which generates an internal
17575 error from subreg_size_highpart_offset, which does not expect a size of zero.
17576
17577 OP1 represents the TImode source operand 1
17578 OP2 represents the TImode source operand 2
17579 LOW_DEST represents the low half (DImode) of TImode operand 0
17580 LOW_IN1 represents the low half (DImode) of TImode operand 1
17581 LOW_IN2 represents the low half (DImode) of TImode operand 2
17582 HIGH_DEST represents the high half (DImode) of TImode operand 0
17583 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17584 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17585
17586
17587 void
17588 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17589 rtx *low_in1, rtx *low_in2,
17590 rtx *high_dest, rtx *high_in1,
17591 rtx *high_in2)
17592 {
17593 *low_dest = gen_reg_rtx (DImode);
17594 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17595 subreg_lowpart_offset (DImode, TImode));
17596
17597 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17598 subreg_lowpart_offset (DImode, TImode));
17599 *high_dest = gen_reg_rtx (DImode);
17600
17601 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17602 subreg_highpart_offset (DImode, TImode));
17603 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17604 subreg_highpart_offset (DImode, TImode));
17605 }
17606
17607 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17608
17609 OP0 represents the TImode destination operand 0
17610 LOW_DEST represents the low half (DImode) of TImode operand 0
17611 LOW_IN1 represents the low half (DImode) of TImode operand 1
17612 LOW_IN2 represents the low half (DImode) of TImode operand 2
17613 HIGH_DEST represents the high half (DImode) of TImode operand 0
17614 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17615 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17616 UNSIGNED_P is true if the operation is being performed on unsigned
17617 values. */
17618 void
17619 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17620 rtx low_in2, rtx high_dest, rtx high_in1,
17621 rtx high_in2, bool unsigned_p)
17622 {
17623 if (low_in2 == const0_rtx)
17624 {
17625 low_dest = low_in1;
17626 high_in2 = force_reg (DImode, high_in2);
17627 if (unsigned_p)
17628 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17629 else
17630 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17631 }
17632 else
17633 {
17634 if (CONST_INT_P (low_in2))
17635 {
17636 high_in2 = force_reg (DImode, high_in2);
17637 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17638 GEN_INT (-INTVAL (low_in2))));
17639 }
17640 else
17641 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17642
17643 if (unsigned_p)
17644 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17645 else
17646 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17647 }
17648
17649 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17650 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17651
17652 }
17653
17654 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17655
17656 static unsigned HOST_WIDE_INT
17657 aarch64_asan_shadow_offset (void)
17658 {
17659 if (TARGET_ILP32)
17660 return (HOST_WIDE_INT_1 << 29);
17661 else
17662 return (HOST_WIDE_INT_1 << 36);
17663 }
17664
17665 static rtx
17666 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17667 int code, tree treeop0, tree treeop1)
17668 {
17669 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17670 rtx op0, op1;
17671 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17672 insn_code icode;
17673 struct expand_operand ops[4];
17674
17675 start_sequence ();
17676 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17677
17678 op_mode = GET_MODE (op0);
17679 if (op_mode == VOIDmode)
17680 op_mode = GET_MODE (op1);
17681
17682 switch (op_mode)
17683 {
17684 case E_QImode:
17685 case E_HImode:
17686 case E_SImode:
17687 cmp_mode = SImode;
17688 icode = CODE_FOR_cmpsi;
17689 break;
17690
17691 case E_DImode:
17692 cmp_mode = DImode;
17693 icode = CODE_FOR_cmpdi;
17694 break;
17695
17696 case E_SFmode:
17697 cmp_mode = SFmode;
17698 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17699 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17700 break;
17701
17702 case E_DFmode:
17703 cmp_mode = DFmode;
17704 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17705 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17706 break;
17707
17708 default:
17709 end_sequence ();
17710 return NULL_RTX;
17711 }
17712
17713 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17714 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17715 if (!op0 || !op1)
17716 {
17717 end_sequence ();
17718 return NULL_RTX;
17719 }
17720 *prep_seq = get_insns ();
17721 end_sequence ();
17722
17723 create_fixed_operand (&ops[0], op0);
17724 create_fixed_operand (&ops[1], op1);
17725
17726 start_sequence ();
17727 if (!maybe_expand_insn (icode, 2, ops))
17728 {
17729 end_sequence ();
17730 return NULL_RTX;
17731 }
17732 *gen_seq = get_insns ();
17733 end_sequence ();
17734
17735 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17736 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17737 }
17738
17739 static rtx
17740 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17741 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17742 {
17743 rtx op0, op1, target;
17744 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17745 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17746 insn_code icode;
17747 struct expand_operand ops[6];
17748 int aarch64_cond;
17749
17750 push_to_sequence (*prep_seq);
17751 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17752
17753 op_mode = GET_MODE (op0);
17754 if (op_mode == VOIDmode)
17755 op_mode = GET_MODE (op1);
17756
17757 switch (op_mode)
17758 {
17759 case E_QImode:
17760 case E_HImode:
17761 case E_SImode:
17762 cmp_mode = SImode;
17763 icode = CODE_FOR_ccmpsi;
17764 break;
17765
17766 case E_DImode:
17767 cmp_mode = DImode;
17768 icode = CODE_FOR_ccmpdi;
17769 break;
17770
17771 case E_SFmode:
17772 cmp_mode = SFmode;
17773 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17774 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17775 break;
17776
17777 case E_DFmode:
17778 cmp_mode = DFmode;
17779 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17780 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17781 break;
17782
17783 default:
17784 end_sequence ();
17785 return NULL_RTX;
17786 }
17787
17788 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17789 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17790 if (!op0 || !op1)
17791 {
17792 end_sequence ();
17793 return NULL_RTX;
17794 }
17795 *prep_seq = get_insns ();
17796 end_sequence ();
17797
17798 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17799 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17800
17801 if (bit_code != AND)
17802 {
17803 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17804 GET_MODE (XEXP (prev, 0))),
17805 VOIDmode, XEXP (prev, 0), const0_rtx);
17806 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17807 }
17808
17809 create_fixed_operand (&ops[0], XEXP (prev, 0));
17810 create_fixed_operand (&ops[1], target);
17811 create_fixed_operand (&ops[2], op0);
17812 create_fixed_operand (&ops[3], op1);
17813 create_fixed_operand (&ops[4], prev);
17814 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17815
17816 push_to_sequence (*gen_seq);
17817 if (!maybe_expand_insn (icode, 6, ops))
17818 {
17819 end_sequence ();
17820 return NULL_RTX;
17821 }
17822
17823 *gen_seq = get_insns ();
17824 end_sequence ();
17825
17826 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17827 }
17828
17829 #undef TARGET_GEN_CCMP_FIRST
17830 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17831
17832 #undef TARGET_GEN_CCMP_NEXT
17833 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
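/* Illustrative sketch: these two hooks let the middle end lower a chained
   condition into CMP followed by CCMP rather than two branches.  For
   example, a function such as

     int both (int a, int b) { return a == 0 && b > 42; }

   can be compiled to something along the lines of (schematic output):

     cmp   w0, #0
     ccmp  w1, #42, #4, eq    // only compares if the first test was EQ
     cset  w0, gt

   where the immediate 4 is the NZCV value (Z set) substituted when the EQ
   predicate on the previous comparison fails, making the final GT false.  */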
17834
17835 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
17836 instruction fusion of some sort. */
17837
17838 static bool
17839 aarch64_macro_fusion_p (void)
17840 {
17841 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17842 }
17843
17844
17845 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17846 should be kept together during scheduling. */
17847
17848 static bool
17849 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17850 {
17851 rtx set_dest;
17852 rtx prev_set = single_set (prev);
17853 rtx curr_set = single_set (curr);
17854 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
17855 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17856
17857 if (!aarch64_macro_fusion_p ())
17858 return false;
17859
17860 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17861 {
17862 /* We are trying to match:
17863 prev (mov) == (set (reg r0) (const_int imm16))
17864 curr (movk) == (set (zero_extract (reg r0)
17865 (const_int 16)
17866 (const_int 16))
17867 (const_int imm16_1)) */
17868
17869 set_dest = SET_DEST (curr_set);
17870
17871 if (GET_CODE (set_dest) == ZERO_EXTRACT
17872 && CONST_INT_P (SET_SRC (curr_set))
17873 && CONST_INT_P (SET_SRC (prev_set))
17874 && CONST_INT_P (XEXP (set_dest, 2))
17875 && INTVAL (XEXP (set_dest, 2)) == 16
17876 && REG_P (XEXP (set_dest, 0))
17877 && REG_P (SET_DEST (prev_set))
17878 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17879 {
17880 return true;
17881 }
17882 }
17883
17884 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17885 {
17886
17887 /* We're trying to match:
17888 prev (adrp) == (set (reg r1)
17889 (high (symbol_ref ("SYM"))))
17890 curr (add) == (set (reg r0)
17891 (lo_sum (reg r1)
17892 (symbol_ref ("SYM"))))
17893 Note that r0 need not necessarily be the same as r1, especially
17894 during pre-regalloc scheduling. */
17895
17896 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17897 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17898 {
17899 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17900 && REG_P (XEXP (SET_SRC (curr_set), 0))
17901 && REGNO (XEXP (SET_SRC (curr_set), 0))
17902 == REGNO (SET_DEST (prev_set))
17903 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17904 XEXP (SET_SRC (curr_set), 1)))
17905 return true;
17906 }
17907 }
17908
17909 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17910 {
17911
17912 /* We're trying to match:
17913 prev (movk) == (set (zero_extract (reg r0)
17914 (const_int 16)
17915 (const_int 32))
17916 (const_int imm16_1))
17917 curr (movk) == (set (zero_extract (reg r0)
17918 (const_int 16)
17919 (const_int 48))
17920 (const_int imm16_2)) */
17921
17922 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17923 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17924 && REG_P (XEXP (SET_DEST (prev_set), 0))
17925 && REG_P (XEXP (SET_DEST (curr_set), 0))
17926 && REGNO (XEXP (SET_DEST (prev_set), 0))
17927 == REGNO (XEXP (SET_DEST (curr_set), 0))
17928 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17929 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17930 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17931 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17932 && CONST_INT_P (SET_SRC (prev_set))
17933 && CONST_INT_P (SET_SRC (curr_set)))
17934 return true;
17935
17936 }
17937 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17938 {
17939 /* We're trying to match:
17940 prev (adrp) == (set (reg r0)
17941 (high (symbol_ref ("SYM"))))
17942 curr (ldr) == (set (reg r1)
17943 (mem (lo_sum (reg r0)
17944 (symbol_ref ("SYM")))))
17945 or
17946 curr (ldr) == (set (reg r1)
17947 (zero_extend (mem
17948 (lo_sum (reg r0)
17949 (symbol_ref ("SYM")))))) */
17950 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17951 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17952 {
17953 rtx curr_src = SET_SRC (curr_set);
17954
17955 if (GET_CODE (curr_src) == ZERO_EXTEND)
17956 curr_src = XEXP (curr_src, 0);
17957
17958 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17959 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17960 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17961 == REGNO (SET_DEST (prev_set))
17962 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17963 XEXP (SET_SRC (prev_set), 0)))
17964 return true;
17965 }
17966 }
17967
17968 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17969 && any_condjump_p (curr))
17970 {
17971 unsigned int condreg1, condreg2;
17972 rtx cc_reg_1;
17973 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17974 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17975
17976 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17977 && prev
17978 && modified_in_p (cc_reg_1, prev))
17979 {
17980 enum attr_type prev_type = get_attr_type (prev);
17981
17982 /* FIXME: this misses some instructions which are considered simple
17983 arithmetic for ThunderX. Simple shifts are missed here. */
17984 if (prev_type == TYPE_ALUS_SREG
17985 || prev_type == TYPE_ALUS_IMM
17986 || prev_type == TYPE_LOGICS_REG
17987 || prev_type == TYPE_LOGICS_IMM)
17988 return true;
17989 }
17990 }
17991
17992 if (prev_set
17993 && curr_set
17994 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17995 && any_condjump_p (curr))
17996 {
17997 /* We're trying to match:
17998 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17999 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18000 (const_int 0))
18001 (label_ref ("SYM"))
18002 (pc)) */
18003 if (SET_DEST (curr_set) == (pc_rtx)
18004 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18005 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18006 && REG_P (SET_DEST (prev_set))
18007 && REGNO (SET_DEST (prev_set))
18008 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18009 {
18010 /* Fuse ALU operations followed by conditional branch instruction. */
18011 switch (get_attr_type (prev))
18012 {
18013 case TYPE_ALU_IMM:
18014 case TYPE_ALU_SREG:
18015 case TYPE_ADC_REG:
18016 case TYPE_ADC_IMM:
18017 case TYPE_ADCS_REG:
18018 case TYPE_ADCS_IMM:
18019 case TYPE_LOGIC_REG:
18020 case TYPE_LOGIC_IMM:
18021 case TYPE_CSEL:
18022 case TYPE_ADR:
18023 case TYPE_MOV_IMM:
18024 case TYPE_SHIFT_REG:
18025 case TYPE_SHIFT_IMM:
18026 case TYPE_BFM:
18027 case TYPE_RBIT:
18028 case TYPE_REV:
18029 case TYPE_EXTEND:
18030 return true;
18031
18032 default:;
18033 }
18034 }
18035 }
18036
18037 return false;
18038 }
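/* Illustrative sketch: typical instruction pairs kept adjacent by the hook
   above, shown as schematic assembly rather than RTL:

     mov   x0, #0x1234               adrp  x1, sym
     movk  x0, #0x5678, lsl #16      add   x0, x1, :lo12:sym

     adrp  x0, sym                   cmp   w0, #0
     ldr   x1, [x0, :lo12:sym]       b.eq  .Llabel

   Whether a particular pair is actually fused depends on the
   AARCH64_FUSE_* bits set in the selected tuning structure.  */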
18039
18040 /* Return true iff the instruction fusion described by OP is enabled. */
18041
18042 bool
18043 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18044 {
18045 return (aarch64_tune_params.fusible_ops & op) != 0;
18046 }
18047
18048 /* If MEM is in the form of [base+offset], extract the two parts
18049 of the address into BASE and OFFSET; otherwise return false
18050 after clearing BASE and OFFSET. */
18051
18052 bool
18053 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18054 {
18055 rtx addr;
18056
18057 gcc_assert (MEM_P (mem));
18058
18059 addr = XEXP (mem, 0);
18060
18061 if (REG_P (addr))
18062 {
18063 *base = addr;
18064 *offset = const0_rtx;
18065 return true;
18066 }
18067
18068 if (GET_CODE (addr) == PLUS
18069 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18070 {
18071 *base = XEXP (addr, 0);
18072 *offset = XEXP (addr, 1);
18073 return true;
18074 }
18075
18076 *base = NULL_RTX;
18077 *offset = NULL_RTX;
18078
18079 return false;
18080 }
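/* Illustrative sketch of a typical (hypothetical) caller of the helper above:

     rtx base, offset;
     if (extract_base_offset_in_addr (mem, &base, &offset))
       {
         // BASE is the base register and INTVAL (offset) the byte offset;
         // a plain register address comes back with offset == const0_rtx.
       }

   The ldp/stp checks below rely on exactly this contract.  */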
18081
18082 /* Types for scheduling fusion. */
18083 enum sched_fusion_type
18084 {
18085 SCHED_FUSION_NONE = 0,
18086 SCHED_FUSION_LD_SIGN_EXTEND,
18087 SCHED_FUSION_LD_ZERO_EXTEND,
18088 SCHED_FUSION_LD,
18089 SCHED_FUSION_ST,
18090 SCHED_FUSION_NUM
18091 };
18092
18093 /* If INSN is a load or store whose address is in the form of [base+offset],
18094 extract the two parts into BASE and OFFSET. Return the scheduling
18095 fusion type of INSN. */
18096
18097 static enum sched_fusion_type
18098 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18099 {
18100 rtx x, dest, src;
18101 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18102
18103 gcc_assert (INSN_P (insn));
18104 x = PATTERN (insn);
18105 if (GET_CODE (x) != SET)
18106 return SCHED_FUSION_NONE;
18107
18108 src = SET_SRC (x);
18109 dest = SET_DEST (x);
18110
18111 machine_mode dest_mode = GET_MODE (dest);
18112
18113 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18114 return SCHED_FUSION_NONE;
18115
18116 if (GET_CODE (src) == SIGN_EXTEND)
18117 {
18118 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18119 src = XEXP (src, 0);
18120 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18121 return SCHED_FUSION_NONE;
18122 }
18123 else if (GET_CODE (src) == ZERO_EXTEND)
18124 {
18125 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18126 src = XEXP (src, 0);
18127 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18128 return SCHED_FUSION_NONE;
18129 }
18130
18131 if (GET_CODE (src) == MEM && REG_P (dest))
18132 extract_base_offset_in_addr (src, base, offset);
18133 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18134 {
18135 fusion = SCHED_FUSION_ST;
18136 extract_base_offset_in_addr (dest, base, offset);
18137 }
18138 else
18139 return SCHED_FUSION_NONE;
18140
18141 if (*base == NULL_RTX || *offset == NULL_RTX)
18142 fusion = SCHED_FUSION_NONE;
18143
18144 return fusion;
18145 }
18146
18147 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18148
18149 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18150 and PRI are only calculated for these instructions. For other instructions,
18151 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18152 types of instruction fusion can be added by returning different priorities.
18153
18154 It's important that irrelevant instructions get the largest FUSION_PRI. */
18155
18156 static void
18157 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18158 int *fusion_pri, int *pri)
18159 {
18160 int tmp, off_val;
18161 rtx base, offset;
18162 enum sched_fusion_type fusion;
18163
18164 gcc_assert (INSN_P (insn));
18165
18166 tmp = max_pri - 1;
18167 fusion = fusion_load_store (insn, &base, &offset);
18168 if (fusion == SCHED_FUSION_NONE)
18169 {
18170 *pri = tmp;
18171 *fusion_pri = tmp;
18172 return;
18173 }
18174
18175 /* Set FUSION_PRI according to fusion type and base register. */
18176 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18177
18178 /* Calculate PRI. */
18179 tmp /= 2;
18180
18181 /* INSN with smaller offset goes first. */
18182 off_val = (int)(INTVAL (offset));
18183 if (off_val >= 0)
18184 tmp -= (off_val & 0xfffff);
18185 else
18186 tmp += ((- off_val) & 0xfffff);
18187
18188 *pri = tmp;
18189 return;
18190 }
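/* Illustrative worked example (hypothetical numbers): with max_pri == 0x1000,
   two SCHED_FUSION_LD loads from [x1, 8] and [x1, 16] both get
   FUSION_PRI = 0xfff - 3 * FIRST_PSEUDO_REGISTER - REGNO (x1), so the
   scheduler treats them as one fusion group, while their PRI values are
   0x7ff - 8 and 0x7ff - 16 respectively, so the access at the smaller
   offset is preferred first.  */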
18191
18192 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18193 Adjust priority of sha1h instructions so they are scheduled before
18194 other SHA1 instructions. */
18195
18196 static int
18197 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18198 {
18199 rtx x = PATTERN (insn);
18200
18201 if (GET_CODE (x) == SET)
18202 {
18203 x = SET_SRC (x);
18204
18205 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18206 return priority + 10;
18207 }
18208
18209 return priority;
18210 }
18211
18212 /* Given OPERANDS of consecutive load/store, check if we can merge
18213 them into ldp/stp. LOAD is true if they are load instructions.
18214 MODE is the mode of memory operands. */
18215
18216 bool
18217 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18218 machine_mode mode)
18219 {
18220 HOST_WIDE_INT offval_1, offval_2, msize;
18221 enum reg_class rclass_1, rclass_2;
18222 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18223
18224 if (load)
18225 {
18226 mem_1 = operands[1];
18227 mem_2 = operands[3];
18228 reg_1 = operands[0];
18229 reg_2 = operands[2];
18230 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18231 if (REGNO (reg_1) == REGNO (reg_2))
18232 return false;
18233 }
18234 else
18235 {
18236 mem_1 = operands[0];
18237 mem_2 = operands[2];
18238 reg_1 = operands[1];
18239 reg_2 = operands[3];
18240 }
18241
18242 /* The mems cannot be volatile. */
18243 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18244 return false;
18245
18246 /* If we have SImode and slow unaligned ldp,
18247 check that the alignment is at least 8 bytes. */
18248 if (mode == SImode
18249 && (aarch64_tune_params.extra_tuning_flags
18250 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18251 && !optimize_size
18252 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18253 return false;
18254
18255 /* Check if the addresses are in the form of [base+offset]. */
18256 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18257 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18258 return false;
18259 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18260 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18261 return false;
18262
18263 /* Check if the bases are the same. */
18264 if (!rtx_equal_p (base_1, base_2))
18265 return false;
18266
18267 /* The operands must be of the same size. */
18268 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18269 GET_MODE_SIZE (GET_MODE (mem_2))));
18270
18271 offval_1 = INTVAL (offset_1);
18272 offval_2 = INTVAL (offset_2);
18273 /* We should only be trying this for fixed-sized modes. There is no
18274 SVE LDP/STP instruction. */
18275 msize = GET_MODE_SIZE (mode).to_constant ();
18276 /* Check if the offsets are consecutive. */
18277 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18278 return false;
18279
18280 /* Check if the addresses are clobbered by load. */
18281 if (load)
18282 {
18283 if (reg_mentioned_p (reg_1, mem_1))
18284 return false;
18285
18286 /* In increasing order, the last load can clobber the address. */
18287 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18288 return false;
18289 }
18290
18291 /* One of the memory accesses must be a mempair operand.
18292 If it is not the first one, they need to be swapped by the
18293 peephole. */
18294 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18295 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18296 return false;
18297
18298 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18299 rclass_1 = FP_REGS;
18300 else
18301 rclass_1 = GENERAL_REGS;
18302
18303 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18304 rclass_2 = FP_REGS;
18305 else
18306 rclass_2 = GENERAL_REGS;
18307
18308 /* Check if the registers are of the same class. */
18309 if (rclass_1 != rclass_2)
18310 return false;
18311
18312 return true;
18313 }
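/* Illustrative sketch: the ldp/stp peepholes built on the predicate above
   turn sequences such as

     ldr  w0, [x2]           str  w0, [x2, #8]
     ldr  w1, [x2, #4]       str  w1, [x2, #12]

   into

     ldp  w0, w1, [x2]       stp  w0, w1, [x2, #8]

   provided the checks above (same base, consecutive offsets, matching
   register classes, no address clobber) all pass.  */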
18314
18315 /* Given OPERANDS of consecutive load/store that can be merged,
18316 swap them if they are not in ascending order. */
18317 void
18318 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18319 {
18320 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18321 HOST_WIDE_INT offval_1, offval_2;
18322
18323 if (load)
18324 {
18325 mem_1 = operands[1];
18326 mem_2 = operands[3];
18327 }
18328 else
18329 {
18330 mem_1 = operands[0];
18331 mem_2 = operands[2];
18332 }
18333
18334 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18335 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18336
18337 offval_1 = INTVAL (offset_1);
18338 offval_2 = INTVAL (offset_2);
18339
18340 if (offval_1 > offval_2)
18341 {
18342 /* Irrespective of whether this is a load or a store,
18343 we do the same swap. */
18344 std::swap (operands[0], operands[2]);
18345 std::swap (operands[1], operands[3]);
18346 }
18347 }
18348
18349 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18350 comparison between the two. */
18351 int
18352 aarch64_host_wide_int_compare (const void *x, const void *y)
18353 {
18354 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18355 * ((const HOST_WIDE_INT *) y));
18356 }
18357
18358 /* Taking X and Y to be pairs of RTX, each pair containing a MEM rtx
18359 and a REG rtx, compare the [base+offset] addresses of the two MEMs
18360 by their offsets.
18361
18362 Return:
18363
18364 1 iff offset (X) > offset (Y)
18365 0 iff offset (X) == offset (Y)
18366 -1 iff offset (X) < offset (Y) */
18367 int
18368 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18369 {
18370 const rtx * operands_1 = (const rtx *) x;
18371 const rtx * operands_2 = (const rtx *) y;
18372 rtx mem_1, mem_2, base, offset_1, offset_2;
18373
18374 if (MEM_P (operands_1[0]))
18375 mem_1 = operands_1[0];
18376 else
18377 mem_1 = operands_1[1];
18378
18379 if (MEM_P (operands_2[0]))
18380 mem_2 = operands_2[0];
18381 else
18382 mem_2 = operands_2[1];
18383
18384 /* Extract the offsets. */
18385 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18386 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18387
18388 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18389
18390 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18391 }
18392
18393 /* Given OPERANDS of consecutive load/store, check if we can merge
18394 them into ldp/stp by adjusting the offset. LOAD is true if they
18395 are load instructions. MODE is the mode of memory operands.
18396
18397 Given the consecutive stores below:
18398
18399 str w1, [xb, 0x100]
18400 str w1, [xb, 0x104]
18401 str w1, [xb, 0x108]
18402 str w1, [xb, 0x10c]
18403
18404 Though the offsets are out of the range supported by stp, we can
18405 still pair them after adjusting the offset, like:
18406
18407 add scratch, xb, 0x100
18408 stp w1, w1, [scratch]
18409 stp w1, w1, [scratch, 0x8]
18410
18411 The peephole patterns detecting this opportunity should guarantee
18412 the scratch register is available. */
18413
18414 bool
18415 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18416 scalar_mode mode)
18417 {
18418 const int num_insns = 4;
18419 enum reg_class rclass;
18420 HOST_WIDE_INT offvals[num_insns], msize;
18421 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18422
18423 if (load)
18424 {
18425 for (int i = 0; i < num_insns; i++)
18426 {
18427 reg[i] = operands[2 * i];
18428 mem[i] = operands[2 * i + 1];
18429
18430 gcc_assert (REG_P (reg[i]));
18431 }
18432
18433 /* Do not attempt to merge the loads if the loads clobber each other. */
18434 for (int i = 0; i < 8; i += 2)
18435 for (int j = i + 2; j < 8; j += 2)
18436 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18437 return false;
18438 }
18439 else
18440 for (int i = 0; i < num_insns; i++)
18441 {
18442 mem[i] = operands[2 * i];
18443 reg[i] = operands[2 * i + 1];
18444 }
18445
18446 /* Skip if the memory operand is by itself valid for ldp/stp. */
18447 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18448 return false;
18449
18450 for (int i = 0; i < num_insns; i++)
18451 {
18452 /* The mems cannot be volatile. */
18453 if (MEM_VOLATILE_P (mem[i]))
18454 return false;
18455
18456 /* Check if the addresses are in the form of [base+offset]. */
18457 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18458 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18459 return false;
18460 }
18461
18462 /* Check if the registers are of the same class. */
18463 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18464 ? FP_REGS : GENERAL_REGS;
18465
18466 for (int i = 1; i < num_insns; i++)
18467 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18468 {
18469 if (rclass != FP_REGS)
18470 return false;
18471 }
18472 else
18473 {
18474 if (rclass != GENERAL_REGS)
18475 return false;
18476 }
18477
18478 /* Only the last register in the order in which they occur
18479 may be clobbered by the load. */
18480 if (rclass == GENERAL_REGS && load)
18481 for (int i = 0; i < num_insns - 1; i++)
18482 if (reg_mentioned_p (reg[i], mem[i]))
18483 return false;
18484
18485 /* Check if the bases are the same. */
18486 for (int i = 0; i < num_insns - 1; i++)
18487 if (!rtx_equal_p (base[i], base[i + 1]))
18488 return false;
18489
18490 for (int i = 0; i < num_insns; i++)
18491 offvals[i] = INTVAL (offset[i]);
18492
18493 msize = GET_MODE_SIZE (mode);
18494
18495 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18496 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18497 aarch64_host_wide_int_compare);
18498
18499 if (!(offvals[1] == offvals[0] + msize
18500 && offvals[3] == offvals[2] + msize))
18501 return false;
18502
18503 /* Check that offsets are within range of each other. The ldp/stp
18504 instructions have 7-bit immediate offsets, so use 0x80. */
18505 if (offvals[2] - offvals[0] >= msize * 0x80)
18506 return false;
18507
18508 /* The offsets must be aligned with respect to each other. */
18509 if (offvals[0] % msize != offvals[2] % msize)
18510 return false;
18511
18512 /* If we have SImode and slow unaligned ldp,
18513 check that the alignment is at least 8 bytes. */
18514 if (mode == SImode
18515 && (aarch64_tune_params.extra_tuning_flags
18516 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18517 && !optimize_size
18518 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18519 return false;
18520
18521 return true;
18522 }
18523
18524 /* Given OPERANDS of consecutive load/store, this function pairs them
18525 into LDP/STP after adjusting the offset. It depends on the fact
18526 that the operands can be sorted so the offsets are correct for STP.
18527 MODE is the mode of the memory operands. CODE is the rtl operator
18528 which should be applied to all memory operands; it is SIGN_EXTEND,
18529 ZERO_EXTEND or UNKNOWN. */
18530
18531 bool
18532 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18533 scalar_mode mode, RTX_CODE code)
18534 {
18535 rtx base, offset_1, offset_3, t1, t2;
18536 rtx mem_1, mem_2, mem_3, mem_4;
18537 rtx temp_operands[8];
18538 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18539 stp_off_upper_limit, stp_off_lower_limit, msize;
18540
18541 /* We make changes on a copy as we may still bail out. */
18542 for (int i = 0; i < 8; i ++)
18543 temp_operands[i] = operands[i];
18544
18545 /* Sort the operands. */
18546 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18547
18548 if (load)
18549 {
18550 mem_1 = temp_operands[1];
18551 mem_2 = temp_operands[3];
18552 mem_3 = temp_operands[5];
18553 mem_4 = temp_operands[7];
18554 }
18555 else
18556 {
18557 mem_1 = temp_operands[0];
18558 mem_2 = temp_operands[2];
18559 mem_3 = temp_operands[4];
18560 mem_4 = temp_operands[6];
18561 gcc_assert (code == UNKNOWN);
18562 }
18563
18564 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18565 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18566 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18567 && offset_3 != NULL_RTX);
18568
18569 /* Adjust offset so it can fit in LDP/STP instruction. */
18570 msize = GET_MODE_SIZE (mode);
18571 stp_off_upper_limit = msize * (0x40 - 1);
18572 stp_off_lower_limit = - msize * 0x40;
18573
18574 off_val_1 = INTVAL (offset_1);
18575 off_val_3 = INTVAL (offset_3);
18576
18577 /* The base offset is optimally halfway between the two STP/LDP offsets. */
18578 if (msize <= 4)
18579 base_off = (off_val_1 + off_val_3) / 2;
18580 else
18581 /* However, due to issues with negative LDP/STP offset generation for
18582 larger modes (DF, DI and vector modes), we must not use negative
18583 addresses smaller than 9 signed unadjusted bits can store. This
18584 provides the most range in this case. */
18585 base_off = off_val_1;
18586
18587 /* Adjust the base so that it is aligned with the addresses but still
18588 optimal. */
18589 if (base_off % msize != off_val_1 % msize)
18590 /* Fix the offset, bearing in mind we want to make it bigger not
18591 smaller. */
18592 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18593 else if (msize <= 4)
18594 /* The negative range of LDP/STP is one larger than the positive range. */
18595 base_off += msize;
18596
18597 /* Check if base offset is too big or too small. We can attempt to resolve
18598 this issue by setting it to the maximum value and seeing if the offsets
18599 still fit. */
18600 if (base_off >= 0x1000)
18601 {
18602 base_off = 0x1000 - 1;
18603 /* We must still make sure that the base offset is aligned with respect
18604 to the address. But it may not be made any bigger. */
18605 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18606 }
18607
18608 /* Likewise for the case where the base is too small. */
18609 if (base_off <= -0x1000)
18610 {
18611 base_off = -0x1000 + 1;
18612 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18613 }
18614
18615 /* Offset of the first STP/LDP. */
18616 new_off_1 = off_val_1 - base_off;
18617
18618 /* Offset of the second STP/LDP. */
18619 new_off_3 = off_val_3 - base_off;
18620
18621 /* The offsets must be within the range of the LDP/STP instructions. */
18622 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18623 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18624 return false;
18625
18626 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18627 new_off_1), true);
18628 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18629 new_off_1 + msize), true);
18630 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18631 new_off_3), true);
18632 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18633 new_off_3 + msize), true);
18634
18635 if (!aarch64_mem_pair_operand (mem_1, mode)
18636 || !aarch64_mem_pair_operand (mem_3, mode))
18637 return false;
18638
18639 if (code == ZERO_EXTEND)
18640 {
18641 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18642 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18643 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18644 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18645 }
18646 else if (code == SIGN_EXTEND)
18647 {
18648 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18649 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18650 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18651 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18652 }
18653
18654 if (load)
18655 {
18656 operands[0] = temp_operands[0];
18657 operands[1] = mem_1;
18658 operands[2] = temp_operands[2];
18659 operands[3] = mem_2;
18660 operands[4] = temp_operands[4];
18661 operands[5] = mem_3;
18662 operands[6] = temp_operands[6];
18663 operands[7] = mem_4;
18664 }
18665 else
18666 {
18667 operands[0] = mem_1;
18668 operands[1] = temp_operands[1];
18669 operands[2] = mem_2;
18670 operands[3] = temp_operands[3];
18671 operands[4] = mem_3;
18672 operands[5] = temp_operands[5];
18673 operands[6] = mem_4;
18674 operands[7] = temp_operands[7];
18675 }
18676
18677 /* Emit adjusting instruction. */
18678 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18679 /* Emit ldp/stp instructions. */
18680 t1 = gen_rtx_SET (operands[0], operands[1]);
18681 t2 = gen_rtx_SET (operands[2], operands[3]);
18682 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18683 t1 = gen_rtx_SET (operands[4], operands[5]);
18684 t2 = gen_rtx_SET (operands[6], operands[7]);
18685 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18686 return true;
18687 }
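/* Illustrative worked example: for four SImode stores at offsets 0x100,
   0x104, 0x108 and 0x10c from xb, msize is 4, so base_off starts at
   (0x100 + 0x108) / 2 = 0x104 and, being already aligned, is bumped by
   msize to 0x108.  The new offsets are then -8 and 0, well inside the
   [-256, 252] stp range for msize == 4, giving roughly (schematic):

     add  scratch, xb, #0x108
     stp  w1, w1, [scratch, #-8]
     stp  w1, w1, [scratch]
*/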
18688
18689 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18690 it isn't worth branching around empty masked ops (including masked
18691 stores). */
18692
18693 static bool
18694 aarch64_empty_mask_is_expensive (unsigned)
18695 {
18696 return false;
18697 }
18698
18699 /* Return 1 if pseudo register should be created and used to hold
18700 GOT address for PIC code. */
18701
18702 bool
18703 aarch64_use_pseudo_pic_reg (void)
18704 {
18705 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18706 }
18707
18708 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18709
18710 static int
18711 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18712 {
18713 switch (XINT (x, 1))
18714 {
18715 case UNSPEC_GOTSMALLPIC:
18716 case UNSPEC_GOTSMALLPIC28K:
18717 case UNSPEC_GOTTINYPIC:
18718 return 0;
18719 default:
18720 break;
18721 }
18722
18723 return default_unspec_may_trap_p (x, flags);
18724 }
18725
18726
18727 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18728 return the log2 of that value. Otherwise return -1. */
18729
18730 int
18731 aarch64_fpconst_pow_of_2 (rtx x)
18732 {
18733 const REAL_VALUE_TYPE *r;
18734
18735 if (!CONST_DOUBLE_P (x))
18736 return -1;
18737
18738 r = CONST_DOUBLE_REAL_VALUE (x);
18739
18740 if (REAL_VALUE_NEGATIVE (*r)
18741 || REAL_VALUE_ISNAN (*r)
18742 || REAL_VALUE_ISINF (*r)
18743 || !real_isinteger (r, DFmode))
18744 return -1;
18745
18746 return exact_log2 (real_to_integer (r));
18747 }
18748
18749 /* If X is a vector of equal CONST_DOUBLE values and that value is
18750 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18751
18752 int
18753 aarch64_vec_fpconst_pow_of_2 (rtx x)
18754 {
18755 int nelts;
18756 if (GET_CODE (x) != CONST_VECTOR
18757 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18758 return -1;
18759
18760 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18761 return -1;
18762
18763 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18764 if (firstval <= 0)
18765 return -1;
18766
18767 for (int i = 1; i < nelts; i++)
18768 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18769 return -1;
18770
18771 return firstval;
18772 }
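/* Illustrative sketch: these helpers are what allow conversion patterns to
   fold a multiply by a power of two into the scaled fixed-point form of
   fcvtzs/fcvtzu, so that, for example,

     int scale_and_truncate (float x) { return (int) (x * 16.0f); }

   can be emitted as a single "fcvtzs w0, s0, #4" rather than an fmul
   followed by a plain fcvtzs.  */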
18773
18774 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18775 to float.
18776
18777 __fp16 always promotes through this hook.
18778 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18779 through the generic excess precision logic rather than here. */
18780
18781 static tree
18782 aarch64_promoted_type (const_tree t)
18783 {
18784 if (SCALAR_FLOAT_TYPE_P (t)
18785 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18786 return float_type_node;
18787
18788 return NULL_TREE;
18789 }
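/* Illustrative sketch: because of this hook, arithmetic on __fp16 is
   carried out in float and only the final result is narrowed back:

     __fp16 add_fp16 (__fp16 a, __fp16 b) { return a + b; }  // adds as float

   _Float16 is deliberately not promoted here; its evaluation width is
   handled by the excess-precision logic below.  */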
18790
18791 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18792
18793 static bool
18794 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18795 optimization_type opt_type)
18796 {
18797 switch (op)
18798 {
18799 case rsqrt_optab:
18800 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18801
18802 default:
18803 return true;
18804 }
18805 }
18806
18807 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18808
18809 static unsigned int
18810 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18811 int *offset)
18812 {
18813 /* Polynomial invariant 1 == (VG / 2) - 1. */
18814 gcc_assert (i == 1);
18815 *factor = 2;
18816 *offset = 1;
18817 return AARCH64_DWARF_VG;
18818 }
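/* Illustrative worked example: FACTOR == 2 and OFFSET == 1 say that
   indeterminate 1 equals VG / 2 - 1.  On a 256-bit SVE implementation
   VG (the number of 64-bit granules) is 4, so the indeterminate is 1 and
   a poly_int64 such as (16 + 16x) bytes resolves to 32 in the emitted
   DWARF expression.  */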
18819
18820 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18821 if MODE is HFmode, and punt to the generic implementation otherwise. */
18822
18823 static bool
18824 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18825 {
18826 return (mode == HFmode
18827 ? true
18828 : default_libgcc_floating_mode_supported_p (mode));
18829 }
18830
18831 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18832 if MODE is HFmode, and punt to the generic implementation otherwise. */
18833
18834 static bool
18835 aarch64_scalar_mode_supported_p (scalar_mode mode)
18836 {
18837 return (mode == HFmode
18838 ? true
18839 : default_scalar_mode_supported_p (mode));
18840 }
18841
18842 /* Set the value of FLT_EVAL_METHOD.
18843 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18844
18845 0: evaluate all operations and constants, whose semantic type has at
18846 most the range and precision of type float, to the range and
18847 precision of float; evaluate all other operations and constants to
18848 the range and precision of the semantic type;
18849
18850 N, where _FloatN is a supported interchange floating type:
18851 evaluate all operations and constants, whose semantic type has at
18852 most the range and precision of _FloatN type, to the range and
18853 precision of the _FloatN type; evaluate all other operations and
18854 constants to the range and precision of the semantic type;
18855
18856 If we have the ARMv8.2-A extensions then we support _Float16 in native
18857 precision, so we should set this to 16. Otherwise, we support the type,
18858 but want to evaluate expressions in float precision, so set this to
18859 0. */
18860
18861 static enum flt_eval_method
18862 aarch64_excess_precision (enum excess_precision_type type)
18863 {
18864 switch (type)
18865 {
18866 case EXCESS_PRECISION_TYPE_FAST:
18867 case EXCESS_PRECISION_TYPE_STANDARD:
18868 /* We can calculate either in 16-bit range and precision or
18869 32-bit range and precision. Make that decision based on whether
18870 we have native support for the ARMv8.2-A 16-bit floating-point
18871 instructions or not. */
18872 return (TARGET_FP_F16INST
18873 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18874 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18875 case EXCESS_PRECISION_TYPE_IMPLICIT:
18876 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18877 default:
18878 gcc_unreachable ();
18879 }
18880 return FLT_EVAL_METHOD_UNPREDICTABLE;
18881 }
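/* Illustrative sketch: the user-visible effect is on code such as

     _Float16 scale (_Float16 a, _Float16 b) { return a * b; }

   With the ARMv8.2-A half-precision instructions (TARGET_FP_F16INST) the
   multiply is performed directly in _Float16; otherwise both operands are
   promoted to float and the product is converted back, as reported to the
   front end through FLT_EVAL_METHOD.  */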
18882
18883 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18884 scheduled for speculative execution. Reject the long-running division
18885 and square-root instructions. */
18886
18887 static bool
18888 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18889 {
18890 switch (get_attr_type (insn))
18891 {
18892 case TYPE_SDIV:
18893 case TYPE_UDIV:
18894 case TYPE_FDIVS:
18895 case TYPE_FDIVD:
18896 case TYPE_FSQRTS:
18897 case TYPE_FSQRTD:
18898 case TYPE_NEON_FP_SQRT_S:
18899 case TYPE_NEON_FP_SQRT_D:
18900 case TYPE_NEON_FP_SQRT_S_Q:
18901 case TYPE_NEON_FP_SQRT_D_Q:
18902 case TYPE_NEON_FP_DIV_S:
18903 case TYPE_NEON_FP_DIV_D:
18904 case TYPE_NEON_FP_DIV_S_Q:
18905 case TYPE_NEON_FP_DIV_D_Q:
18906 return false;
18907 default:
18908 return true;
18909 }
18910 }
18911
18912 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18913
18914 static int
18915 aarch64_compute_pressure_classes (reg_class *classes)
18916 {
18917 int i = 0;
18918 classes[i++] = GENERAL_REGS;
18919 classes[i++] = FP_REGS;
18920 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18921 registers need to go in PR_LO_REGS at some point during their
18922 lifetime. Splitting it into two halves has the effect of making
18923 all predicates count against PR_LO_REGS, so that we try whenever
18924 possible to restrict the number of live predicates to 8. This
18925 greatly reduces the amount of spilling in certain loops. */
18926 classes[i++] = PR_LO_REGS;
18927 classes[i++] = PR_HI_REGS;
18928 return i;
18929 }
18930
18931 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18932
18933 static bool
18934 aarch64_can_change_mode_class (machine_mode from,
18935 machine_mode to, reg_class_t)
18936 {
18937 if (BYTES_BIG_ENDIAN)
18938 {
18939 bool from_sve_p = aarch64_sve_data_mode_p (from);
18940 bool to_sve_p = aarch64_sve_data_mode_p (to);
18941
18942 /* Don't allow changes between SVE data modes and non-SVE modes.
18943 See the comment at the head of aarch64-sve.md for details. */
18944 if (from_sve_p != to_sve_p)
18945 return false;
18946
18947 /* Don't allow changes in element size: lane 0 of the new vector
18948 would not then be lane 0 of the old vector. See the comment
18949 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18950 description.
18951
18952 In the worst case, this forces a register to be spilled in
18953 one mode and reloaded in the other, which handles the
18954 endianness correctly. */
18955 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18956 return false;
18957 }
18958 return true;
18959 }
18960
18961 /* Implement TARGET_EARLY_REMAT_MODES. */
18962
18963 static void
18964 aarch64_select_early_remat_modes (sbitmap modes)
18965 {
18966 /* SVE values are not normally live across a call, so it should be
18967 worth doing early rematerialization even in VL-specific mode. */
18968 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18969 {
18970 machine_mode mode = (machine_mode) i;
18971 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18972 if (vec_flags & VEC_ANY_SVE)
18973 bitmap_set_bit (modes, i);
18974 }
18975 }
18976
18977 /* Override the default target speculation_safe_value. */
18978 static rtx
18979 aarch64_speculation_safe_value (machine_mode mode,
18980 rtx result, rtx val, rtx failval)
18981 {
18982 /* Maybe we should warn if falling back to hard barriers. They are
18983 likely to be noticeably more expensive than the alternative below. */
18984 if (!aarch64_track_speculation)
18985 return default_speculation_safe_value (mode, result, val, failval);
18986
18987 if (!REG_P (val))
18988 val = copy_to_mode_reg (mode, val);
18989
18990 if (!aarch64_reg_or_zero (failval, mode))
18991 failval = copy_to_mode_reg (mode, failval);
18992
18993 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18994 return result;
18995 }
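/* Illustrative sketch: this hook is reached from user code such as

     int element (int *p, unsigned i, unsigned n)
     {
       if (i < n)
         return p[__builtin_speculation_safe_value (i)];
       return 0;
     }

   With -mtrack-speculation the sanitisation above is expected to use a
   conditional-select sequence keyed off the speculation-tracking state
   rather than the default full speculation barrier.  */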
18996
18997 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18998 Look into the tuning structure for an estimate.
18999 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19000 Advanced SIMD 128 bits. */
19001
19002 static HOST_WIDE_INT
19003 aarch64_estimated_poly_value (poly_int64 val)
19004 {
19005 enum aarch64_sve_vector_bits_enum width_source
19006 = aarch64_tune_params.sve_width;
19007
19008 /* If we still don't have an estimate, use the default. */
19009 if (width_source == SVE_SCALABLE)
19010 return default_estimated_poly_value (val);
19011
19012 HOST_WIDE_INT over_128 = width_source - 128;
19013 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19014 }
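/* Illustrative worked example: an SVE data vector of bytes has a
   poly_int64 size of (16 + 16x).  If the selected tuning sets sve_width
   to 256 then over_128 is 128 and the estimate is
   16 + 16 * 128 / 128 = 32 bytes, i.e. a 256-bit vector is assumed for
   costing purposes.  */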
19015
19016
19017 /* Return true for types that could be supported as SIMD return or
19018 argument types. */
19019
19020 static bool
19021 supported_simd_type (tree t)
19022 {
19023 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19024 {
19025 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19026 return s == 1 || s == 2 || s == 4 || s == 8;
19027 }
19028 return false;
19029 }
19030
19031 /* Return true for types that currently are supported as SIMD return
19032 or argument types. */
19033
19034 static bool
19035 currently_supported_simd_type (tree t, tree b)
19036 {
19037 if (COMPLEX_FLOAT_TYPE_P (t))
19038 return false;
19039
19040 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19041 return false;
19042
19043 return supported_simd_type (t);
19044 }
19045
19046 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19047
19048 static int
19049 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19050 struct cgraph_simd_clone *clonei,
19051 tree base_type, int num)
19052 {
19053 tree t, ret_type, arg_type;
19054 unsigned int elt_bits, vec_bits, count;
19055
19056 if (!TARGET_SIMD)
19057 return 0;
19058
19059 if (clonei->simdlen
19060 && (clonei->simdlen < 2
19061 || clonei->simdlen > 1024
19062 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19063 {
19064 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19065 "unsupported simdlen %d", clonei->simdlen);
19066 return 0;
19067 }
19068
19069 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19070 if (TREE_CODE (ret_type) != VOID_TYPE
19071 && !currently_supported_simd_type (ret_type, base_type))
19072 {
19073 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19074 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19075 "GCC does not currently support mixed size types "
19076 "for %<simd%> functions");
19077 else if (supported_simd_type (ret_type))
19078 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19079 "GCC does not currently support return type %qT "
19080 "for %<simd%> functions", ret_type);
19081 else
19082 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19083 "unsupported return type %qT for %<simd%> functions",
19084 ret_type);
19085 return 0;
19086 }
19087
19088 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19089 {
19090 arg_type = TREE_TYPE (t);
19091
19092 if (!currently_supported_simd_type (arg_type, base_type))
19093 {
19094 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19095 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19096 "GCC does not currently support mixed size types "
19097 "for %<simd%> functions");
19098 else
19099 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19100 "GCC does not currently support argument type %qT "
19101 "for %<simd%> functions", arg_type);
19102 return 0;
19103 }
19104 }
19105
19106 clonei->vecsize_mangle = 'n';
19107 clonei->mask_mode = VOIDmode;
19108 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19109 if (clonei->simdlen == 0)
19110 {
19111 count = 2;
19112 vec_bits = (num == 0 ? 64 : 128);
19113 clonei->simdlen = vec_bits / elt_bits;
19114 }
19115 else
19116 {
19117 count = 1;
19118 vec_bits = clonei->simdlen * elt_bits;
19119 if (vec_bits != 64 && vec_bits != 128)
19120 {
19121 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19122 "GCC does not currently support simdlen %d for type %qT",
19123 clonei->simdlen, base_type);
19124 return 0;
19125 }
19126 }
19127 clonei->vecsize_int = vec_bits;
19128 clonei->vecsize_float = vec_bits;
19129 return count;
19130 }
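/* Illustrative sketch: for a declaration such as

     #pragma omp declare simd notinbranch
     float scale (float x);

   and no explicit simdlen, the hook above requests two Advanced SIMD
   clones, a 64-bit one (simdlen 2) and a 128-bit one (simdlen 4), both
   using the 'n' ISA letter in their mangled names (e.g. _ZGVnN4v_scale
   for the 128-bit variant).  */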
19131
19132 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19133
19134 static void
19135 aarch64_simd_clone_adjust (struct cgraph_node *node)
19136 {
19137 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19138 use the correct ABI. */
19139
19140 tree t = TREE_TYPE (node->decl);
19141 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19142 TYPE_ATTRIBUTES (t));
19143 }
19144
19145 /* Implement TARGET_SIMD_CLONE_USABLE. */
19146
19147 static int
19148 aarch64_simd_clone_usable (struct cgraph_node *node)
19149 {
19150 switch (node->simdclone->vecsize_mangle)
19151 {
19152 case 'n':
19153 if (!TARGET_SIMD)
19154 return -1;
19155 return 0;
19156 default:
19157 gcc_unreachable ();
19158 }
19159 }
19160
19161 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19162
19163 static int
19164 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19165 {
19166 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19167 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19168 return 0;
19169 return 1;
19170 }
19171
19172 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19173
19174 static const char *
19175 aarch64_get_multilib_abi_name (void)
19176 {
19177 if (TARGET_BIG_END)
19178 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19179 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19180 }
19181
19182 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
19183 global variable based guard, use the default; otherwise
19184 return a null tree. */
19185 static tree
19186 aarch64_stack_protect_guard (void)
19187 {
19188 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19189 return default_stack_protect_guard ();
19190
19191 return NULL_TREE;
19192 }
19193
19194 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19195 section at the end if needed. */
19196 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19197 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19198 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19199 void
19200 aarch64_file_end_indicate_exec_stack ()
19201 {
19202 file_end_indicate_exec_stack ();
19203
19204 unsigned feature_1_and = 0;
19205 if (aarch64_bti_enabled ())
19206 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19207
19208 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19209 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19210
19211 if (feature_1_and)
19212 {
19213 /* Generate .note.gnu.property section. */
19214 switch_to_section (get_section (".note.gnu.property",
19215 SECTION_NOTYPE, NULL));
19216
19217 /* PT_NOTE header: namesz, descsz, type.
19218 namesz = 4 ("GNU\0")
19219 descsz = 16 (Size of the program property array)
19220 [(12 + padding) * Number of array elements]
19221 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19222 assemble_align (POINTER_SIZE);
19223 assemble_integer (GEN_INT (4), 4, 32, 1);
19224 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19225 assemble_integer (GEN_INT (5), 4, 32, 1);
19226
19227 /* PT_NOTE name. */
19228 assemble_string ("GNU", 4);
19229
19230 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19231 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19232 datasz = 4
19233 data = feature_1_and. */
19234 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19235 assemble_integer (GEN_INT (4), 4, 32, 1);
19236 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19237
19238 /* Pad the size of the note to the required alignment. */
19239 assemble_align (POINTER_SIZE);
19240 }
19241 }
19242 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19243 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19244 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
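/* Illustrative sketch: with both BTI and return-address signing enabled the
   function above emits a note roughly equivalent to the following LP64
   assembly (padding follows POINTER_SIZE):

     .section .note.gnu.property, "a"
     .align  3
     .word   4              // namesz ("GNU\0")
     .word   16             // descsz
     .word   5              // NT_GNU_PROPERTY_TYPE_0
     .asciz  "GNU"
     .word   0xc0000000     // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word   4              // datasz
     .word   3              // BTI | PAC
     .align  3
*/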
19245
19246 /* Target-specific selftests. */
19247
19248 #if CHECKING_P
19249
19250 namespace selftest {
19251
19252 /* Selftest for the RTL loader.
19253 Verify that the RTL loader copes with a dump from
19254 print_rtx_function. This is essentially just a test that class
19255 function_reader can handle a real dump, but it also verifies
19256 that lookup_reg_by_dump_name correctly handles hard regs.
19257 The presence of hard reg names in the dump means that the test is
19258 target-specific, hence it is in this file. */
19259
19260 static void
19261 aarch64_test_loading_full_dump ()
19262 {
19263 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19264
19265 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19266
19267 rtx_insn *insn_1 = get_insn_by_uid (1);
19268 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19269
19270 rtx_insn *insn_15 = get_insn_by_uid (15);
19271 ASSERT_EQ (INSN, GET_CODE (insn_15));
19272 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19273
19274 /* Verify crtl->return_rtx. */
19275 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19276 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19277 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19278 }
19279
19280 /* Run all target-specific selftests. */
19281
19282 static void
19283 aarch64_run_selftests (void)
19284 {
19285 aarch64_test_loading_full_dump ();
19286 }
19287
19288 } // namespace selftest
19289
19290 #endif /* #if CHECKING_P */
19291
19292 #undef TARGET_STACK_PROTECT_GUARD
19293 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19294
19295 #undef TARGET_ADDRESS_COST
19296 #define TARGET_ADDRESS_COST aarch64_address_cost
19297
19298 /* This hook determines whether unnamed bitfields affect the alignment
19299 of the containing structure. The hook returns true if the structure
19300 should inherit the alignment requirements of an unnamed bitfield's
19301 type. */
19302 #undef TARGET_ALIGN_ANON_BITFIELD
19303 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19304
19305 #undef TARGET_ASM_ALIGNED_DI_OP
19306 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19307
19308 #undef TARGET_ASM_ALIGNED_HI_OP
19309 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19310
19311 #undef TARGET_ASM_ALIGNED_SI_OP
19312 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19313
19314 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19315 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19316 hook_bool_const_tree_hwi_hwi_const_tree_true
19317
19318 #undef TARGET_ASM_FILE_START
19319 #define TARGET_ASM_FILE_START aarch64_start_file
19320
19321 #undef TARGET_ASM_OUTPUT_MI_THUNK
19322 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19323
19324 #undef TARGET_ASM_SELECT_RTX_SECTION
19325 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19326
19327 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19328 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19329
19330 #undef TARGET_BUILD_BUILTIN_VA_LIST
19331 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19332
19333 #undef TARGET_CALLEE_COPIES
19334 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19335
19336 #undef TARGET_CAN_ELIMINATE
19337 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19338
19339 #undef TARGET_CAN_INLINE_P
19340 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19341
19342 #undef TARGET_CANNOT_FORCE_CONST_MEM
19343 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19344
19345 #undef TARGET_CASE_VALUES_THRESHOLD
19346 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19347
19348 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19349 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19350
19351 /* Only the least significant bit is used for initialization guard
19352 variables. */
19353 #undef TARGET_CXX_GUARD_MASK_BIT
19354 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19355
19356 #undef TARGET_C_MODE_FOR_SUFFIX
19357 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19358
19359 #ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

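/* For illustration (a hypothetical user-level example): given

     struct status { volatile unsigned int ready : 1; };

   returning false means an access to READY is expected to use the declared
   "unsigned int" container width rather than the narrowest machine mode
   that covers the bit-field.  */
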
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

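/* Sketch of the window chosen above, assuming byte-sized anchor-relative
   accesses (the symbol name is hypothetical): the anchor is materialised
   once and then used as a base register, e.g.

     adrp  x0, anchor
     add   x0, x0, :lo12:anchor
     ldrb  w1, [x0, #4095]      // unsigned scaled 12-bit offset: 0..4095
     ldurb w2, [x0, #-256]      // signed unscaled 9-bit offset: -256..255

   which matches the [-256, 4095] range given by the two hooks above.  */
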
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

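/* Roughly speaking, the value is a power of two naming the spare address
   bit used by GCC's generic descriptor support for nested functions, so 4
   selects bit 2; an indirect call can then test that bit at run time to
   distinguish a pointer to a descriptor from an ordinary code address.  */
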
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
  aarch64_remove_extra_call_preserved_regs

#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
  aarch64_return_call_with_max_clobbers

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

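/* TARGET_INITIALIZER (from target-def.h) is only expanded here, so it picks
   up every TARGET_* override defined above and fills in the documented
   defaults for the hooks this file does not override.  */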
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"