1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
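/* For illustration (not part of the original source): a vector whose
   SFmode elements are all 1.0 would use the (SFmode, value) constructor
   above, while a constant whose elements are {4, 5, 6, ...} would use the
   series constructor with value 4 and step 1, leaving insn as MOV and the
   shift modifier as (LSL, 0).  */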
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 uint64_t aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
 354         their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
 485 /* Costs for vector insn classes for Cortex-A57. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
 524 /* Costs for vector insn classes for X-Gene 1. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
 544 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017      | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1033    for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046      | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params neoversen1_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
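/* These entries are matched against the name in each "name=value" field of
   the -moverride option string (for example, something like
   -moverride=tune=...,sve_width=256); the corresponding parse_override
   hook is then invoked on that field to adjust the current tune_params.  */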
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const uint64_t flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1176
1177 /* The current tuning set. */
1178 struct tune_params aarch64_tune_params = generic_tunings;
1179
1180 /* Table of machine attributes. */
1181 static const struct attribute_spec aarch64_attribute_table[] =
1182 {
1183 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1184 affects_type_identity, handler, exclude } */
1185 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1186 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1187 };
1188
1189 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1190
1191 /* An ISA extension in the co-processor and main instruction set space. */
1192 struct aarch64_option_extension
1193 {
1194 const char *const name;
1195 const unsigned long flags_on;
1196 const unsigned long flags_off;
1197 };
1198
1199 typedef enum aarch64_cond_code
1200 {
1201 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1202 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1203 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1204 }
1205 aarch64_cc;
1206
1207 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
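/* The aarch64_cond_code values follow the architectural encoding, in which
   each condition sits next to its inverse, so flipping the low bit gives
   the inverse condition: EQ <-> NE, CS <-> CC, GE <-> LT, and so on.  For
   example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT) is AARCH64_LE.  */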
1208
1209 struct aarch64_branch_protect_type
1210 {
1211 /* The type's name that the user passes to the branch-protection option
1212 string. */
1213 const char* name;
1214 /* Function to handle the protection type and set global variables.
1215      First argument is the string token corresponding to this type and the
1216      second argument is the next token in the option string.
1217      Return values:
1218      * AARCH64_PARSE_OK: Handling was successful.
1219      * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1220        caller should print an error.
1221      * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1222        prints its own error. */
1223 enum aarch64_parse_opt_result (*handler)(char*, char*);
1224 /* A list of types that can follow this type in the option string. */
1225 const aarch64_branch_protect_type* subtypes;
1226 unsigned int num_subtypes;
1227 };
1228
1229 static enum aarch64_parse_opt_result
1230 aarch64_handle_no_branch_protection (char* str, char* rest)
1231 {
1232 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1233 aarch64_enable_bti = 0;
1234 if (rest)
1235 {
1236 error ("unexpected %<%s%> after %<%s%>", rest, str);
1237 return AARCH64_PARSE_INVALID_FEATURE;
1238 }
1239 return AARCH64_PARSE_OK;
1240 }
1241
1242 static enum aarch64_parse_opt_result
1243 aarch64_handle_standard_branch_protection (char* str, char* rest)
1244 {
1245 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1246 aarch64_ra_sign_key = AARCH64_KEY_A;
1247 aarch64_enable_bti = 1;
1248 if (rest)
1249 {
1250 error ("unexpected %<%s%> after %<%s%>", rest, str);
1251 return AARCH64_PARSE_INVALID_FEATURE;
1252 }
1253 return AARCH64_PARSE_OK;
1254 }
1255
1256 static enum aarch64_parse_opt_result
1257 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1258 char* rest ATTRIBUTE_UNUSED)
1259 {
1260 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1261 aarch64_ra_sign_key = AARCH64_KEY_A;
1262 return AARCH64_PARSE_OK;
1263 }
1264
1265 static enum aarch64_parse_opt_result
1266 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1267 char* rest ATTRIBUTE_UNUSED)
1268 {
1269 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1270 return AARCH64_PARSE_OK;
1271 }
1272
1273 static enum aarch64_parse_opt_result
1274 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1275 char* rest ATTRIBUTE_UNUSED)
1276 {
1277 aarch64_ra_sign_key = AARCH64_KEY_B;
1278 return AARCH64_PARSE_OK;
1279 }
1280
1281 static enum aarch64_parse_opt_result
1282 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1283 char* rest ATTRIBUTE_UNUSED)
1284 {
1285 aarch64_enable_bti = 1;
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1290 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1291 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1292 { NULL, NULL, NULL, 0 }
1293 };
1294
1295 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1296 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1297 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1298 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1299 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1300 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1301 { NULL, NULL, NULL, 0 }
1302 };
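/* Together these tables describe the accepted -mbranch-protection strings,
   which are '+'-separated tokens: the top-level types "none", "standard",
   "bti" and "pac-ret", plus the subtypes that may follow "pac-ret".  For
   example, "pac-ret+leaf+b-key" requests return-address signing of all
   functions (not just non-leaf ones) using the B key.  */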
1303
1304 /* The condition codes of the processor, and the inverse function. */
1305 static const char * const aarch64_condition_codes[] =
1306 {
1307 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1308 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1309 };
1310
1311 /* Generate code for conditional branches whose targets lie beyond the
1312    1 MiB range of a conditional branch instruction. */
1312 const char *
1313 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1314 const char * branch_format)
1315 {
1316 rtx_code_label * tmp_label = gen_label_rtx ();
1317 char label_buf[256];
1318 char buffer[128];
1319 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1320 CODE_LABEL_NUMBER (tmp_label));
1321 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1322 rtx dest_label = operands[pos_label];
1323 operands[pos_label] = tmp_label;
1324
1325 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1326 output_asm_insn (buffer, operands);
1327
1328 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1329 operands[pos_label] = dest_label;
1330 output_asm_insn (buffer, operands);
1331 return "";
1332 }
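/* Roughly, the sequence emitted above for an out-of-range conditional
   branch is (label name illustrative):
	<branch_format>	.Llocal
	b	<original destination>
     .Llocal:
   The caller is expected to pass the inverted condition in BRANCH_FORMAT,
   so the short conditional branch only has to skip the following "b",
   which has a much larger range.  */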
1333
1334 void
1335 aarch64_err_no_fpadvsimd (machine_mode mode)
1336 {
1337 if (TARGET_GENERAL_REGS_ONLY)
1338 if (FLOAT_MODE_P (mode))
1339 error ("%qs is incompatible with the use of floating-point types",
1340 "-mgeneral-regs-only");
1341 else
1342 error ("%qs is incompatible with the use of vector types",
1343 "-mgeneral-regs-only");
1344 else
1345 if (FLOAT_MODE_P (mode))
1346 error ("%qs feature modifier is incompatible with the use of"
1347 " floating-point types", "+nofp");
1348 else
1349 error ("%qs feature modifier is incompatible with the use of"
1350 " vector types", "+nofp");
1351 }
1352
1353 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1354 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1355 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1356 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1357 and GENERAL_REGS is lower than the memory cost (in this case the best class
1358    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1359 cost results in bad allocations with many redundant int<->FP moves which
1360 are expensive on various cores.
1361 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1362 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1363 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1364 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1365 The result of this is that it is no longer inefficient to have a higher
1366 memory move cost than the register move cost.
1367 */
1368
1369 static reg_class_t
1370 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1371 reg_class_t best_class)
1372 {
1373 machine_mode mode;
1374
1375 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1376 || !reg_class_subset_p (FP_REGS, allocno_class))
1377 return allocno_class;
1378
1379 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1380 || !reg_class_subset_p (FP_REGS, best_class))
1381 return best_class;
1382
1383 mode = PSEUDO_REGNO_MODE (regno);
1384 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1385 }
1386
1387 static unsigned int
1388 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1389 {
1390 if (GET_MODE_UNIT_SIZE (mode) == 4)
1391 return aarch64_tune_params.min_div_recip_mul_sf;
1392 return aarch64_tune_params.min_div_recip_mul_df;
1393 }
1394
1395 /* Return the reassociation width of treeop OPC with mode MODE. */
1396 static int
1397 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1398 {
1399 if (VECTOR_MODE_P (mode))
1400 return aarch64_tune_params.vec_reassoc_width;
1401 if (INTEGRAL_MODE_P (mode))
1402 return aarch64_tune_params.int_reassoc_width;
1403 /* Avoid reassociating floating point addition so we emit more FMAs. */
1404 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1405 return aarch64_tune_params.fp_reassoc_width;
1406 return 1;
1407 }
1408
1409 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1410 unsigned
1411 aarch64_dbx_register_number (unsigned regno)
1412 {
1413 if (GP_REGNUM_P (regno))
1414 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1415 else if (regno == SP_REGNUM)
1416 return AARCH64_DWARF_SP;
1417 else if (FP_REGNUM_P (regno))
1418 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1419 else if (PR_REGNUM_P (regno))
1420 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1421 else if (regno == VG_REGNUM)
1422 return AARCH64_DWARF_VG;
1423
1424 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1425 equivalent DWARF register. */
1426 return DWARF_FRAME_REGISTERS;
1427 }
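/* Under the AArch64 DWARF register numbering this maps x0-x30 to 0-30,
   sp to 31 and v0-v31 to 64-95, with the SVE predicate registers and VG
   taking the AARCH64_DWARF_P0 and AARCH64_DWARF_VG values defined in the
   backend headers.  */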
1428
1429 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1430 static bool
1431 aarch64_advsimd_struct_mode_p (machine_mode mode)
1432 {
1433 return (TARGET_SIMD
1434 && (mode == OImode || mode == CImode || mode == XImode));
1435 }
1436
1437 /* Return true if MODE is an SVE predicate mode. */
1438 static bool
1439 aarch64_sve_pred_mode_p (machine_mode mode)
1440 {
1441 return (TARGET_SVE
1442 && (mode == VNx16BImode
1443 || mode == VNx8BImode
1444 || mode == VNx4BImode
1445 || mode == VNx2BImode));
1446 }
1447
1448 /* Three mutually-exclusive flags describing a vector or predicate type. */
1449 const unsigned int VEC_ADVSIMD = 1;
1450 const unsigned int VEC_SVE_DATA = 2;
1451 const unsigned int VEC_SVE_PRED = 4;
1452 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1453 a structure of 2, 3 or 4 vectors. */
1454 const unsigned int VEC_STRUCT = 8;
1455 /* Useful combinations of the above. */
1456 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1457 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1458
1459 /* Return a set of flags describing the vector properties of mode MODE.
1460 Ignore modes that are not supported by the current target. */
1461 static unsigned int
1462 aarch64_classify_vector_mode (machine_mode mode)
1463 {
1464 if (aarch64_advsimd_struct_mode_p (mode))
1465 return VEC_ADVSIMD | VEC_STRUCT;
1466
1467 if (aarch64_sve_pred_mode_p (mode))
1468 return VEC_SVE_PRED;
1469
1470 scalar_mode inner = GET_MODE_INNER (mode);
1471 if (VECTOR_MODE_P (mode)
1472 && (inner == QImode
1473 || inner == HImode
1474 || inner == HFmode
1475 || inner == SImode
1476 || inner == SFmode
1477 || inner == DImode
1478 || inner == DFmode))
1479 {
1480 if (TARGET_SVE)
1481 {
1482 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1483 return VEC_SVE_DATA;
1484 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1485 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1486 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1487 return VEC_SVE_DATA | VEC_STRUCT;
1488 }
1489
1490 /* This includes V1DF but not V1DI (which doesn't exist). */
1491 if (TARGET_SIMD
1492 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1493 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1494 return VEC_ADVSIMD;
1495 }
1496
1497 return 0;
1498 }
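/* For example, V4SImode (a 128-bit Advanced SIMD vector) classifies as
   VEC_ADVSIMD, the OImode/CImode/XImode structure modes as
   VEC_ADVSIMD | VEC_STRUCT, and a single SVE data vector such as
   VNx4SImode as VEC_SVE_DATA; scalar modes return 0.  */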
1499
1500 /* Return true if MODE is any of the data vector modes, including
1501 structure modes. */
1502 static bool
1503 aarch64_vector_data_mode_p (machine_mode mode)
1504 {
1505 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1506 }
1507
1508 /* Return true if MODE is an SVE data vector mode; either a single vector
1509 or a structure of vectors. */
1510 static bool
1511 aarch64_sve_data_mode_p (machine_mode mode)
1512 {
1513 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1514 }
1515
1516 /* Implement target hook TARGET_ARRAY_MODE. */
1517 static opt_machine_mode
1518 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1521 && IN_RANGE (nelems, 2, 4))
1522 return mode_for_vector (GET_MODE_INNER (mode),
1523 GET_MODE_NUNITS (mode) * nelems);
1524
1525 return opt_machine_mode ();
1526 }
1527
1528 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1529 static bool
1530 aarch64_array_mode_supported_p (machine_mode mode,
1531 unsigned HOST_WIDE_INT nelems)
1532 {
1533 if (TARGET_SIMD
1534 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1535 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1536 && (nelems >= 2 && nelems <= 4))
1537 return true;
1538
1539 return false;
1540 }
1541
1542 /* Return the SVE predicate mode to use for elements that have
1543 ELEM_NBYTES bytes, if such a mode exists. */
1544
1545 opt_machine_mode
1546 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1547 {
1548 if (TARGET_SVE)
1549 {
1550 if (elem_nbytes == 1)
1551 return VNx16BImode;
1552 if (elem_nbytes == 2)
1553 return VNx8BImode;
1554 if (elem_nbytes == 4)
1555 return VNx4BImode;
1556 if (elem_nbytes == 8)
1557 return VNx2BImode;
1558 }
1559 return opt_machine_mode ();
1560 }
1561
1562 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1563
1564 static opt_machine_mode
1565 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1566 {
1567 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1568 {
1569 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1570 machine_mode pred_mode;
1571 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1572 return pred_mode;
1573 }
1574
1575 return default_get_mask_mode (nunits, nbytes);
1576 }
1577
1578 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1579 prefer to use the first arithmetic operand as the else value if
1580 the else value doesn't matter, since that exactly matches the SVE
1581 destructive merging form. For ternary operations we could either
1582 pick the first operand and use FMAD-like instructions or the last
1583 operand and use FMLA-like instructions; the latter seems more
1584 natural. */
1585
1586 static tree
1587 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1588 {
1589 return nops == 3 ? ops[2] : ops[0];
1590 }
1591
1592 /* Implement TARGET_HARD_REGNO_NREGS. */
1593
1594 static unsigned int
1595 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1596 {
1597 /* ??? Logically we should only need to provide a value when
1598 HARD_REGNO_MODE_OK says that the combination is valid,
1599 but at the moment we need to handle all modes. Just ignore
1600 any runtime parts for registers that can't store them. */
1601 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1602 switch (aarch64_regno_regclass (regno))
1603 {
1604 case FP_REGS:
1605 case FP_LO_REGS:
1606 if (aarch64_sve_data_mode_p (mode))
1607 return exact_div (GET_MODE_SIZE (mode),
1608 BYTES_PER_SVE_VECTOR).to_constant ();
1609 return CEIL (lowest_size, UNITS_PER_VREG);
1610 case PR_REGS:
1611 case PR_LO_REGS:
1612 case PR_HI_REGS:
1613 return 1;
1614 default:
1615 return CEIL (lowest_size, UNITS_PER_WORD);
1616 }
1617 gcc_unreachable ();
1618 }
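/* For example, TImode occupies two general registers (CEIL (16, 8)) but a
   single FP/vector register, while an SVE data mode spanning one SVE
   vector, such as VNx4SImode, always counts as one register regardless of
   the runtime vector length.  */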
1619
1620 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1621
1622 static bool
1623 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1624 {
1625 if (GET_MODE_CLASS (mode) == MODE_CC)
1626 return regno == CC_REGNUM;
1627
1628 if (regno == VG_REGNUM)
1629 /* This must have the same size as _Unwind_Word. */
1630 return mode == DImode;
1631
1632 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1633 if (vec_flags & VEC_SVE_PRED)
1634 return PR_REGNUM_P (regno);
1635
1636 if (PR_REGNUM_P (regno))
1637 return 0;
1638
1639 if (regno == SP_REGNUM)
1640 /* The purpose of comparing with ptr_mode is to support the
1641 global register variable associated with the stack pointer
1642 register via the syntax of asm ("wsp") in ILP32. */
1643 return mode == Pmode || mode == ptr_mode;
1644
1645 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1646 return mode == Pmode;
1647
1648 if (GP_REGNUM_P (regno))
1649 {
1650 if (known_le (GET_MODE_SIZE (mode), 8))
1651 return true;
1652 else if (known_le (GET_MODE_SIZE (mode), 16))
1653 return (regno & 1) == 0;
1654 }
1655 else if (FP_REGNUM_P (regno))
1656 {
1657 if (vec_flags & VEC_STRUCT)
1658 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1659 else
1660 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1661 }
1662
1663 return false;
1664 }
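/* Note that a mode wider than 8 but at most 16 bytes (TImode, for
   instance) is accepted in general registers only at an even register
   number; such a value occupies the register pair regno, regno + 1.  */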
1665
1666 /* Return true if this is a definition of a vectorized simd function. */
1667
1668 static bool
1669 aarch64_simd_decl_p (tree fndecl)
1670 {
1671 tree fntype;
1672
1673 if (fndecl == NULL)
1674 return false;
1675 fntype = TREE_TYPE (fndecl);
1676 if (fntype == NULL)
1677 return false;
1678
1679 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1680 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1681 return true;
1682
1683 return false;
1684 }
1685
1686 /* Return the mode a register save/restore should use. DImode for integer
1687 registers, DFmode for FP registers in non-SIMD functions (they only save
1688 the bottom half of a 128 bit register), or TFmode for FP registers in
1689 SIMD functions. */
1690
1691 static machine_mode
1692 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1693 {
1694 return GP_REGNUM_P (regno)
1695 ? E_DImode
1696 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1697 }
1698
1699 /* Return true if the instruction is a call to a SIMD function, false
1700 if it is not a SIMD function or if we do not know anything about
1701 the function. */
1702
1703 static bool
1704 aarch64_simd_call_p (rtx_insn *insn)
1705 {
1706 rtx symbol;
1707 rtx call;
1708 tree fndecl;
1709
1710 gcc_assert (CALL_P (insn));
1711 call = get_call_rtx_from (insn);
1712 symbol = XEXP (XEXP (call, 0), 0);
1713 if (GET_CODE (symbol) != SYMBOL_REF)
1714 return false;
1715 fndecl = SYMBOL_REF_DECL (symbol);
1716 if (!fndecl)
1717 return false;
1718
1719 return aarch64_simd_decl_p (fndecl);
1720 }
1721
1722 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1723 a function that uses the SIMD ABI, take advantage of the extra
1724 call-preserved registers that the ABI provides. */
1725
1726 void
1727 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1728 HARD_REG_SET *return_set)
1729 {
1730 if (aarch64_simd_call_p (insn))
1731 {
1732 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1733 if (FP_SIMD_SAVED_REGNUM_P (regno))
1734 CLEAR_HARD_REG_BIT (*return_set, regno);
1735 }
1736 }
1737
1738 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1739 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1740 clobbers the top 64 bits when restoring the bottom 64 bits. */
1741
1742 static bool
1743 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1744 machine_mode mode)
1745 {
1746 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1747 return FP_REGNUM_P (regno)
1748 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1749 }
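/* For example, a V2DImode value (16 bytes) held in v8 survives a call to a
   function using the aarch64_vector_pcs ABI, which preserves the full
   128-bit registers, but is considered part-clobbered across an ordinary
   call, where only the low 64 bits of v8-v15 are preserved.  */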
1750
1751 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1752
1753 rtx_insn *
1754 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1755 {
1756 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1757
1758 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1759 return call_1;
1760 else
1761 return call_2;
1762 }
1763
1764 /* Implement REGMODE_NATURAL_SIZE. */
1765 poly_uint64
1766 aarch64_regmode_natural_size (machine_mode mode)
1767 {
1768 /* The natural size for SVE data modes is one SVE data vector,
1769 and similarly for predicates. We can't independently modify
1770 anything smaller than that. */
1771 /* ??? For now, only do this for variable-width SVE registers.
1772 Doing it for constant-sized registers breaks lower-subreg.c. */
1773 /* ??? And once that's fixed, we should probably have similar
1774 code for Advanced SIMD. */
1775 if (!aarch64_sve_vg.is_constant ())
1776 {
1777 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1778 if (vec_flags & VEC_SVE_PRED)
1779 return BYTES_PER_SVE_PRED;
1780 if (vec_flags & VEC_SVE_DATA)
1781 return BYTES_PER_SVE_VECTOR;
1782 }
1783 return UNITS_PER_WORD;
1784 }
1785
1786 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1787 machine_mode
1788 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1789 machine_mode mode)
1790 {
1791 /* The predicate mode determines which bits are significant and
1792 which are "don't care". Decreasing the number of lanes would
1793 lose data while increasing the number of lanes would make bits
1794 unnecessarily significant. */
1795 if (PR_REGNUM_P (regno))
1796 return mode;
1797 if (known_ge (GET_MODE_SIZE (mode), 4))
1798 return mode;
1799 else
1800 return SImode;
1801 }
1802
1803 /* Return true if I's bits are consecutive ones from the MSB. */
1804 bool
1805 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1806 {
1807 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1808 }
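/* A worked example of the check above (illustrative only): for
   i = 0xffffffffffff0000 the 48 high bits are consecutive ones, -i is
   0x10000, a power of two, so exact_log2 returns 16 and the function
   returns true.  For i = 0xff00ff0000000000 the ones are not contiguous
   from the MSB, -i is not a power of two, exact_log2 returns -1 and the
   function returns false.  */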
1809
1810 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1811 that strcpy from constants will be faster. */
1812
1813 static HOST_WIDE_INT
1814 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1815 {
1816 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1817 return MAX (align, BITS_PER_WORD);
1818 return align;
1819 }
1820
1821 /* Return true if calls to DECL should be treated as
1822 long-calls (ie called via a register). */
1823 static bool
1824 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1825 {
1826 return false;
1827 }
1828
1829 /* Return true if calls to symbol-ref SYM should be treated as
1830 long-calls (ie called via a register). */
1831 bool
1832 aarch64_is_long_call_p (rtx sym)
1833 {
1834 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1835 }
1836
1837 /* Return true if calls to symbol-ref SYM should not go through
1838 plt stubs. */
1839
1840 bool
1841 aarch64_is_noplt_call_p (rtx sym)
1842 {
1843 const_tree decl = SYMBOL_REF_DECL (sym);
1844
1845 if (flag_pic
1846 && decl
1847 && (!flag_plt
1848 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1849 && !targetm.binds_local_p (decl))
1850 return true;
1851
1852 return false;
1853 }
1854
1855 /* Return true if the offsets to a zero/sign-extract operation
1856 represent an expression that matches an extend operation. The
1857 operands represent the parameters from
1858
1859 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1860 bool
1861 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1862 rtx extract_imm)
1863 {
1864 HOST_WIDE_INT mult_val, extract_val;
1865
1866 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1867 return false;
1868
1869 mult_val = INTVAL (mult_imm);
1870 extract_val = INTVAL (extract_imm);
1871
1872 if (extract_val > 8
1873 && extract_val < GET_MODE_BITSIZE (mode)
1874 && exact_log2 (extract_val & ~7) > 0
1875 && (extract_val & 7) <= 4
1876 && mult_val == (1 << (extract_val & 7)))
1877 return true;
1878
1879 return false;
1880 }
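/* An illustrative case of the conditions above, assuming DImode:
   MULT_IMM == 4 and EXTRACT_IMM == 34 pass every check (34 > 8,
   34 < 64, 34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4, and
   4 == 1 << 2), so

     (extract:DI (mult (reg) (const_int 4)) (const_int 34) (const_int 0))

   behaves like a 32-bit extend of the register followed by a left shift
   by 2, i.e. an extended-register operand such as UXTW #2.  */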
1881
1882 /* Emit an insn that's a simple single-set. Both the operands must be
1883 known to be valid. */
1884 inline static rtx_insn *
1885 emit_set_insn (rtx x, rtx y)
1886 {
1887 return emit_insn (gen_rtx_SET (x, y));
1888 }
1889
1890 /* X and Y are two things to compare using CODE. Emit the compare insn and
1891 return the rtx for register 0 in the proper mode. */
1892 rtx
1893 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1894 {
1895 machine_mode mode = SELECT_CC_MODE (code, x, y);
1896 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1897
1898 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1899 return cc_reg;
1900 }
1901
1902 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1903
1904 static rtx
1905 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1906 machine_mode y_mode)
1907 {
1908 if (y_mode == E_QImode || y_mode == E_HImode)
1909 {
1910 if (CONST_INT_P (y))
1911 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1912 else
1913 {
1914 rtx t, cc_reg;
1915 machine_mode cc_mode;
1916
1917 t = gen_rtx_ZERO_EXTEND (SImode, y);
1918 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1919 cc_mode = CC_SWPmode;
1920 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1921 emit_set_insn (cc_reg, t);
1922 return cc_reg;
1923 }
1924 }
1925
1926 return aarch64_gen_compare_reg (code, x, y);
1927 }
1928
1929 /* Build the SYMBOL_REF for __tls_get_addr. */
1930
1931 static GTY(()) rtx tls_get_addr_libfunc;
1932
1933 rtx
1934 aarch64_tls_get_addr (void)
1935 {
1936 if (!tls_get_addr_libfunc)
1937 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1938 return tls_get_addr_libfunc;
1939 }
1940
1941 /* Return the TLS model to use for ADDR. */
1942
1943 static enum tls_model
1944 tls_symbolic_operand_type (rtx addr)
1945 {
1946 enum tls_model tls_kind = TLS_MODEL_NONE;
1947 if (GET_CODE (addr) == CONST)
1948 {
1949 poly_int64 addend;
1950 rtx sym = strip_offset (addr, &addend);
1951 if (GET_CODE (sym) == SYMBOL_REF)
1952 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1953 }
1954 else if (GET_CODE (addr) == SYMBOL_REF)
1955 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1956
1957 return tls_kind;
1958 }
1959
1960 /* We'll allow lo_sum's in our legitimate addresses so that combine
1961 can take care of combining addresses where necessary, but for code
1962 generation purposes we'll generate the address
1963 as:
1964 RTL Absolute
1965 tmp = hi (symbol_ref); adrp x1, foo
1966 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1967 nop
1968
1969 PIC TLS
1970 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1971 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1972 bl __tls_get_addr
1973 nop
1974
1975 Load TLS symbol, depending on TLS mechanism and TLS access model.
1976
1977 Global Dynamic - Traditional TLS:
1978 adrp tmp, :tlsgd:imm
1979 add dest, tmp, #:tlsgd_lo12:imm
1980 bl __tls_get_addr
1981
1982 Global Dynamic - TLS Descriptors:
1983 adrp dest, :tlsdesc:imm
1984 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1985 add dest, dest, #:tlsdesc_lo12:imm
1986 blr tmp
1987 mrs tp, tpidr_el0
1988 add dest, dest, tp
1989
1990 Initial Exec:
1991 mrs tp, tpidr_el0
1992 adrp tmp, :gottprel:imm
1993 ldr dest, [tmp, #:gottprel_lo12:imm]
1994 add dest, dest, tp
1995
1996 Local Exec:
1997 mrs tp, tpidr_el0
1998 add t0, tp, #:tprel_hi12:imm, lsl #12
1999 add t0, t0, #:tprel_lo12_nc:imm
2000 */
2001
2002 static void
2003 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2004 enum aarch64_symbol_type type)
2005 {
2006 switch (type)
2007 {
2008 case SYMBOL_SMALL_ABSOLUTE:
2009 {
2010 /* In ILP32, the mode of dest can be either SImode or DImode. */
2011 rtx tmp_reg = dest;
2012 machine_mode mode = GET_MODE (dest);
2013
2014 gcc_assert (mode == Pmode || mode == ptr_mode);
2015
2016 if (can_create_pseudo_p ())
2017 tmp_reg = gen_reg_rtx (mode);
2018
2019 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2020 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2021 return;
2022 }
2023
2024 case SYMBOL_TINY_ABSOLUTE:
2025 emit_insn (gen_rtx_SET (dest, imm));
2026 return;
2027
2028 case SYMBOL_SMALL_GOT_28K:
2029 {
2030 machine_mode mode = GET_MODE (dest);
2031 rtx gp_rtx = pic_offset_table_rtx;
2032 rtx insn;
2033 rtx mem;
2034
2035 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2036 here before rtl expand. Tree IVOPT will generate rtl pattern to
2037 decide rtx costs, in which case pic_offset_table_rtx is not
2038 initialized. For that case no need to generate the first adrp
2039 instruction as the final cost for global variable access is
2040 one instruction. */
2041 if (gp_rtx != NULL)
2042 {
2043 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2044 use the page base as the GOT base, the first page may be wasted;
2045 in the worst case only 28K of GOT space is left).
2046
2047 The instruction sequence generated to access a global variable
2048 is:
2049
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2051
2052 Only one instruction is needed, but we must initialize
2053 pic_offset_table_rtx properly. We generate the initialization insn
2054 for every global access and allow CSE to remove all redundant copies.
2055
2056 The final instruction sequence for accessing multiple global
2057 variables will look like the following:
2058
2059 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2060
2061 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2062 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2063 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2064 ... */
2065
2066 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2067 crtl->uses_pic_offset_table = 1;
2068 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2069
2070 if (mode != GET_MODE (gp_rtx))
2071 gp_rtx = gen_lowpart (mode, gp_rtx);
2072
2073 }
2074
2075 if (mode == ptr_mode)
2076 {
2077 if (mode == DImode)
2078 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2079 else
2080 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2081
2082 mem = XVECEXP (SET_SRC (insn), 0, 0);
2083 }
2084 else
2085 {
2086 gcc_assert (mode == Pmode);
2087
2088 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2089 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2090 }
2091
2092 /* The operand is expected to be a MEM. Whenever the related insn
2093 pattern is changed, the code above that computes MEM must be
2094 updated. */
2095 gcc_assert (GET_CODE (mem) == MEM);
2096 MEM_READONLY_P (mem) = 1;
2097 MEM_NOTRAP_P (mem) = 1;
2098 emit_insn (insn);
2099 return;
2100 }
2101
2102 case SYMBOL_SMALL_GOT_4G:
2103 {
2104 /* In ILP32, the mode of dest can be either SImode or DImode,
2105 while the got entry is always of SImode size. The mode of
2106 dest depends on how dest is used: if dest is assigned to a
2107 pointer (e.g. in the memory), it has SImode; it may have
2108 DImode if dest is dereferenced to access the memory.
2109 This is why we have to handle three different ldr_got_small
2110 patterns here (two patterns for ILP32). */
2111
2112 rtx insn;
2113 rtx mem;
2114 rtx tmp_reg = dest;
2115 machine_mode mode = GET_MODE (dest);
2116
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2119
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 if (mode == ptr_mode)
2122 {
2123 if (mode == DImode)
2124 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2125 else
2126 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2127
2128 mem = XVECEXP (SET_SRC (insn), 0, 0);
2129 }
2130 else
2131 {
2132 gcc_assert (mode == Pmode);
2133
2134 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2135 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2136 }
2137
2138 gcc_assert (GET_CODE (mem) == MEM);
2139 MEM_READONLY_P (mem) = 1;
2140 MEM_NOTRAP_P (mem) = 1;
2141 emit_insn (insn);
2142 return;
2143 }
2144
2145 case SYMBOL_SMALL_TLSGD:
2146 {
2147 rtx_insn *insns;
2148 machine_mode mode = GET_MODE (dest);
2149 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2150
2151 start_sequence ();
2152 if (TARGET_ILP32)
2153 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2154 else
2155 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2156 insns = get_insns ();
2157 end_sequence ();
2158
2159 RTL_CONST_CALL_P (insns) = 1;
2160 emit_libcall_block (insns, dest, result, imm);
2161 return;
2162 }
2163
2164 case SYMBOL_SMALL_TLSDESC:
2165 {
2166 machine_mode mode = GET_MODE (dest);
2167 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2168 rtx tp;
2169
2170 gcc_assert (mode == Pmode || mode == ptr_mode);
2171
2172 /* In ILP32, the got entry is always of SImode size. Unlike
2173 small GOT, the dest is fixed at reg 0. */
2174 if (TARGET_ILP32)
2175 emit_insn (gen_tlsdesc_small_si (imm));
2176 else
2177 emit_insn (gen_tlsdesc_small_di (imm));
2178 tp = aarch64_load_tp (NULL);
2179
2180 if (mode != Pmode)
2181 tp = gen_lowpart (mode, tp);
2182
2183 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2184 if (REG_P (dest))
2185 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2186 return;
2187 }
2188
2189 case SYMBOL_SMALL_TLSIE:
2190 {
2191 /* In ILP32, the mode of dest can be either SImode or DImode,
2192 while the got entry is always of SImode size. The mode of
2193 dest depends on how dest is used: if dest is assigned to a
2194 pointer (e.g. in the memory), it has SImode; it may have
2195 DImode if dest is dereferenced to access the memory.
2196 This is why we have to handle three different tlsie_small
2197 patterns here (two patterns for ILP32). */
2198 machine_mode mode = GET_MODE (dest);
2199 rtx tmp_reg = gen_reg_rtx (mode);
2200 rtx tp = aarch64_load_tp (NULL);
2201
2202 if (mode == ptr_mode)
2203 {
2204 if (mode == DImode)
2205 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2206 else
2207 {
2208 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2209 tp = gen_lowpart (mode, tp);
2210 }
2211 }
2212 else
2213 {
2214 gcc_assert (mode == Pmode);
2215 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2216 }
2217
2218 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2219 if (REG_P (dest))
2220 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2221 return;
2222 }
2223
2224 case SYMBOL_TLSLE12:
2225 case SYMBOL_TLSLE24:
2226 case SYMBOL_TLSLE32:
2227 case SYMBOL_TLSLE48:
2228 {
2229 machine_mode mode = GET_MODE (dest);
2230 rtx tp = aarch64_load_tp (NULL);
2231
2232 if (mode != Pmode)
2233 tp = gen_lowpart (mode, tp);
2234
2235 switch (type)
2236 {
2237 case SYMBOL_TLSLE12:
2238 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2239 (dest, tp, imm));
2240 break;
2241 case SYMBOL_TLSLE24:
2242 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2243 (dest, tp, imm));
2244 break;
2245 case SYMBOL_TLSLE32:
2246 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2247 (dest, imm));
2248 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2249 (dest, dest, tp));
2250 break;
2251 case SYMBOL_TLSLE48:
2252 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2253 (dest, imm));
2254 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2255 (dest, dest, tp));
2256 break;
2257 default:
2258 gcc_unreachable ();
2259 }
2260
2261 if (REG_P (dest))
2262 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2263 return;
2264 }
2265
2266 case SYMBOL_TINY_GOT:
2267 emit_insn (gen_ldr_got_tiny (dest, imm));
2268 return;
2269
2270 case SYMBOL_TINY_TLSIE:
2271 {
2272 machine_mode mode = GET_MODE (dest);
2273 rtx tp = aarch64_load_tp (NULL);
2274
2275 if (mode == ptr_mode)
2276 {
2277 if (mode == DImode)
2278 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2279 else
2280 {
2281 tp = gen_lowpart (mode, tp);
2282 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2283 }
2284 }
2285 else
2286 {
2287 gcc_assert (mode == Pmode);
2288 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2289 }
2290
2291 if (REG_P (dest))
2292 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2293 return;
2294 }
2295
2296 default:
2297 gcc_unreachable ();
2298 }
2299 }
2300
2301 /* Emit a move from SRC to DEST. Assume that the move expanders can
2302 handle all moves if !can_create_pseudo_p (). The distinction is
2303 important because, unlike emit_move_insn, the move expanders know
2304 how to force Pmode objects into the constant pool even when the
2305 constant pool address is not itself legitimate. */
2306 static rtx
2307 aarch64_emit_move (rtx dest, rtx src)
2308 {
2309 return (can_create_pseudo_p ()
2310 ? emit_move_insn (dest, src)
2311 : emit_move_insn_1 (dest, src));
2312 }
2313
2314 /* Apply UNOPTAB to OP and store the result in DEST. */
2315
2316 static void
2317 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2318 {
2319 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2320 if (dest != tmp)
2321 emit_move_insn (dest, tmp);
2322 }
2323
2324 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2325
2326 static void
2327 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2328 {
2329 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2330 OPTAB_DIRECT);
2331 if (dest != tmp)
2332 emit_move_insn (dest, tmp);
2333 }
2334
2335 /* Split a 128-bit move operation into two 64-bit move operations,
2336 taking care to handle partial overlap of register to register
2337 copies. Special cases are needed when moving between GP regs and
2338 FP regs. SRC can be a register, constant or memory; DST a register
2339 or memory. If either operand is memory it must not have any side
2340 effects. */
2341 void
2342 aarch64_split_128bit_move (rtx dst, rtx src)
2343 {
2344 rtx dst_lo, dst_hi;
2345 rtx src_lo, src_hi;
2346
2347 machine_mode mode = GET_MODE (dst);
2348
2349 gcc_assert (mode == TImode || mode == TFmode);
2350 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2351 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2352
2353 if (REG_P (dst) && REG_P (src))
2354 {
2355 int src_regno = REGNO (src);
2356 int dst_regno = REGNO (dst);
2357
2358 /* Handle FP <-> GP regs. */
2359 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2360 {
2361 src_lo = gen_lowpart (word_mode, src);
2362 src_hi = gen_highpart (word_mode, src);
2363
2364 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2365 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2366 return;
2367 }
2368 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2369 {
2370 dst_lo = gen_lowpart (word_mode, dst);
2371 dst_hi = gen_highpart (word_mode, dst);
2372
2373 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2374 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2375 return;
2376 }
2377 }
2378
2379 dst_lo = gen_lowpart (word_mode, dst);
2380 dst_hi = gen_highpart (word_mode, dst);
2381 src_lo = gen_lowpart (word_mode, src);
2382 src_hi = gen_highpart_mode (word_mode, mode, src);
2383
2384 /* At most one pairing may overlap. */
2385 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2386 {
2387 aarch64_emit_move (dst_hi, src_hi);
2388 aarch64_emit_move (dst_lo, src_lo);
2389 }
2390 else
2391 {
2392 aarch64_emit_move (dst_lo, src_lo);
2393 aarch64_emit_move (dst_hi, src_hi);
2394 }
2395 }
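/* An illustrative example of the overlap handling above, taking the
   lower-numbered register of each pair as the low half of the TImode
   value: copying {x1,x2} into {x0,x1} finds no overlap between dst_lo
   (x0) and src_hi (x2), so the low halves move first (x0 = x1, then
   x1 = x2).  Copying {x0,x1} into {x1,x2} finds that dst_lo (x1)
   overlaps src_hi (x1), so the high halves move first (x2 = x1, then
   x1 = x0).  */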
2396
2397 bool
2398 aarch64_split_128bit_move_p (rtx dst, rtx src)
2399 {
2400 return (! REG_P (src)
2401 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2402 }
2403
2404 /* Split a complex SIMD combine. */
2405
2406 void
2407 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2408 {
2409 machine_mode src_mode = GET_MODE (src1);
2410 machine_mode dst_mode = GET_MODE (dst);
2411
2412 gcc_assert (VECTOR_MODE_P (dst_mode));
2413 gcc_assert (register_operand (dst, dst_mode)
2414 && register_operand (src1, src_mode)
2415 && register_operand (src2, src_mode));
2416
2417 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2418 return;
2419 }
2420
2421 /* Split a complex SIMD move. */
2422
2423 void
2424 aarch64_split_simd_move (rtx dst, rtx src)
2425 {
2426 machine_mode src_mode = GET_MODE (src);
2427 machine_mode dst_mode = GET_MODE (dst);
2428
2429 gcc_assert (VECTOR_MODE_P (dst_mode));
2430
2431 if (REG_P (dst) && REG_P (src))
2432 {
2433 gcc_assert (VECTOR_MODE_P (src_mode));
2434 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2435 }
2436 }
2437
2438 bool
2439 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2440 machine_mode ymode, rtx y)
2441 {
2442 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2443 gcc_assert (r != NULL);
2444 return rtx_equal_p (x, r);
2445 }
2446
2447
2448 static rtx
2449 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2450 {
2451 if (can_create_pseudo_p ())
2452 return force_reg (mode, value);
2453 else
2454 {
2455 gcc_assert (x);
2456 aarch64_emit_move (x, value);
2457 return x;
2458 }
2459 }
2460
2461 /* Return true if we can move VALUE into a register using a single
2462 CNT[BHWD] instruction. */
2463
2464 static bool
2465 aarch64_sve_cnt_immediate_p (poly_int64 value)
2466 {
2467 HOST_WIDE_INT factor = value.coeffs[0];
2468 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2469 return (value.coeffs[1] == factor
2470 && IN_RANGE (factor, 2, 16 * 16)
2471 && (factor & 1) == 0
2472 && factor <= 16 * (factor & -factor));
2473 }
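/* Illustrative examples of the test above: poly_int64 (8, 8) -- the
   number of 16-bit elements in an SVE vector -- is accepted: both
   coefficients are 8, 8 is even, lies in [2, 256] and satisfies
   8 <= 16 * 8, so it can be loaded with CNTH.  poly_int64 (34, 34) is
   rejected because 34 > 16 * (34 & -34): it would need a multiplier
   of 17.  A plain constant such as 16 is rejected because its two
   coefficients differ, i.e. it is not a VL-dependent value.  */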
2474
2475 /* Likewise for rtx X. */
2476
2477 bool
2478 aarch64_sve_cnt_immediate_p (rtx x)
2479 {
2480 poly_int64 value;
2481 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2482 }
2483
2484 /* Return the asm string for an instruction with a CNT-like vector size
2485 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2486 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2487 first part of the operands template (the part that comes before the
2488 vector size itself). FACTOR is the number of quadwords.
2489 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2490 If it is zero, we can use any element size. */
2491
2492 static char *
2493 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2494 unsigned int factor,
2495 unsigned int nelts_per_vq)
2496 {
2497 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2498
2499 if (nelts_per_vq == 0)
2500 /* There is some overlap in the ranges of the four CNT instructions.
2501 Here we always use the smallest possible element size, so that the
2502 multiplier is 1 wherever possible. */
2503 nelts_per_vq = factor & -factor;
2504 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2505 gcc_assert (IN_RANGE (shift, 1, 4));
2506 char suffix = "dwhb"[shift - 1];
2507
2508 factor >>= shift;
2509 unsigned int written;
2510 if (factor == 1)
2511 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2512 prefix, suffix, operands);
2513 else
2514 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2515 prefix, suffix, operands, factor);
2516 gcc_assert (written < sizeof (buffer));
2517 return buffer;
2518 }
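/* An illustrative trace of the suffix/multiplier selection above,
   assuming PREFIX is "cnt" and OPERANDS is "%x0": FACTOR == 8 with
   NELTS_PER_VQ == 0 picks NELTS_PER_VQ = 8, shift = 3, suffix 'h' and
   a residual factor of 1, giving "cnth\t%x0".  FACTOR == 32 with
   NELTS_PER_VQ == 0 picks NELTS_PER_VQ = 32, which is clamped to a
   shift of 4 and suffix 'b', giving "cntb\t%x0, all, mul #2".  */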
2519
2520 /* Return the asm string for an instruction with a CNT-like vector size
2521 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2522 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2523 first part of the operands template (the part that comes before the
2524 vector size itself). X is the value of the vector size operand,
2525 as a polynomial integer rtx. */
2526
2527 char *
2528 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2529 rtx x)
2530 {
2531 poly_int64 value = rtx_to_poly_int64 (x);
2532 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2533 return aarch64_output_sve_cnt_immediate (prefix, operands,
2534 value.coeffs[1], 0);
2535 }
2536
2537 /* Return true if we can add VALUE to a register using a single ADDVL
2538 or ADDPL instruction. */
2539
2540 static bool
2541 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2542 {
2543 HOST_WIDE_INT factor = value.coeffs[0];
2544 if (factor == 0 || value.coeffs[1] != factor)
2545 return false;
2546 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2547 and a value of 16 is one vector width. */
2548 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2549 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2550 }
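/* Illustrative examples of the test above: poly_int64 (16, 16) is one
   full vector width and can be added with "addvl ..., #1", while
   poly_int64 (6, 6) is three predicate widths and can be added with
   "addpl ..., #3".  poly_int64 (66, 66) is rejected: 66 is not a
   multiple of 16, and although it is even it exceeds the ADDPL range
   of [-64, 62].  */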
2551
2552 /* Likewise for rtx X. */
2553
2554 bool
2555 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2556 {
2557 poly_int64 value;
2558 return (poly_int_rtx_p (x, &value)
2559 && aarch64_sve_addvl_addpl_immediate_p (value));
2560 }
2561
2562 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2563 and storing the result in operand 0. */
2564
2565 char *
2566 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2567 {
2568 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2569 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2570 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2571
2572 /* Use INC or DEC if possible. */
2573 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2574 {
2575 if (aarch64_sve_cnt_immediate_p (offset_value))
2576 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2577 offset_value.coeffs[1], 0);
2578 if (aarch64_sve_cnt_immediate_p (-offset_value))
2579 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2580 -offset_value.coeffs[1], 0);
2581 }
2582
2583 int factor = offset_value.coeffs[1];
2584 if ((factor & 15) == 0)
2585 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2586 else
2587 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2588 return buffer;
2589 }
2590
2591 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2592 instruction. If it is, store the number of elements in each vector
2593 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2594 factor in *FACTOR_OUT (if nonnull). */
2595
2596 bool
2597 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2598 unsigned int *nelts_per_vq_out)
2599 {
2600 rtx elt;
2601 poly_int64 value;
2602
2603 if (!const_vec_duplicate_p (x, &elt)
2604 || !poly_int_rtx_p (elt, &value))
2605 return false;
2606
2607 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2608 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2609 /* There's no vector INCB. */
2610 return false;
2611
2612 HOST_WIDE_INT factor = value.coeffs[0];
2613 if (value.coeffs[1] != factor)
2614 return false;
2615
2616 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2617 if ((factor % nelts_per_vq) != 0
2618 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2619 return false;
2620
2621 if (factor_out)
2622 *factor_out = factor;
2623 if (nelts_per_vq_out)
2624 *nelts_per_vq_out = nelts_per_vq;
2625 return true;
2626 }
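/* An illustrative case of the checks above: a VNx4SI constant whose
   elements are all poly_int64 (8, 8) is accepted, since 32-bit
   elements give NELTS_PER_VQ == 4 and 8 is a multiple of 4 with |8|
   in [4, 64]; it corresponds to "incw ..., all, mul #2".  A duplicate
   in a byte-element mode such as VNx16QI is always rejected, because
   that would need the non-existent vector INCB.  */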
2627
2628 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2629 instruction. */
2630
2631 bool
2632 aarch64_sve_inc_dec_immediate_p (rtx x)
2633 {
2634 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2635 }
2636
2637 /* Return the asm template for an SVE vector INC or DEC instruction.
2638 OPERANDS gives the operands before the vector count and X is the
2639 value of the vector count operand itself. */
2640
2641 char *
2642 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2643 {
2644 int factor;
2645 unsigned int nelts_per_vq;
2646 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2647 gcc_unreachable ();
2648 if (factor < 0)
2649 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2650 nelts_per_vq);
2651 else
2652 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2653 nelts_per_vq);
2654 }
2655
2656 static int
2657 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2658 scalar_int_mode mode)
2659 {
2660 int i;
2661 unsigned HOST_WIDE_INT val, val2, mask;
2662 int one_match, zero_match;
2663 int num_insns;
2664
2665 val = INTVAL (imm);
2666
2667 if (aarch64_move_imm (val, mode))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, imm));
2671 return 1;
2672 }
2673
2674 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2675 (with XXXX non-zero). In that case check to see if the move can be done in
2676 a smaller mode. */
2677 val2 = val & 0xffffffff;
2678 if (mode == DImode
2679 && aarch64_move_imm (val2, SImode)
2680 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2681 {
2682 if (generate)
2683 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2684
2685 /* Check if we have to emit a second instruction by checking to see
2686 if any of the upper 32 bits of the original DI mode value is set. */
2687 if (val == val2)
2688 return 1;
2689
2690 i = (val >> 48) ? 48 : 32;
2691
2692 if (generate)
2693 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2694 GEN_INT ((val >> i) & 0xffff)));
2695
2696 return 2;
2697 }
2698
2699 if ((val >> 32) == 0 || mode == SImode)
2700 {
2701 if (generate)
2702 {
2703 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2704 if (mode == SImode)
2705 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2706 GEN_INT ((val >> 16) & 0xffff)));
2707 else
2708 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2709 GEN_INT ((val >> 16) & 0xffff)));
2710 }
2711 return 2;
2712 }
2713
2714 /* Remaining cases are all for DImode. */
2715
2716 mask = 0xffff;
2717 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2718 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2719 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2720 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2721
2722 if (zero_match != 2 && one_match != 2)
2723 {
2724 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2725 For a 64-bit bitmask try whether changing 16 bits to all ones or
2726 zeroes creates a valid bitmask. To check any repeated bitmask,
2727 try using 16 bits from the other 32-bit half of val. */
2728
2729 for (i = 0; i < 64; i += 16, mask <<= 16)
2730 {
2731 val2 = val & ~mask;
2732 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2733 break;
2734 val2 = val | mask;
2735 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2736 break;
2737 val2 = val2 & ~mask;
2738 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2739 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2740 break;
2741 }
2742 if (i != 64)
2743 {
2744 if (generate)
2745 {
2746 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2747 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2748 GEN_INT ((val >> i) & 0xffff)));
2749 }
2750 return 2;
2751 }
2752 }
2753
2754 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2755 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2756 otherwise skip zero bits. */
2757
2758 num_insns = 1;
2759 mask = 0xffff;
2760 val2 = one_match > zero_match ? ~val : val;
2761 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2762
2763 if (generate)
2764 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2765 ? (val | ~(mask << i))
2766 : (val & (mask << i)))));
2767 for (i += 16; i < 64; i += 16)
2768 {
2769 if ((val2 & (mask << i)) == 0)
2770 continue;
2771 if (generate)
2772 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2773 GEN_INT ((val >> i) & 0xffff)));
2774 num_insns ++;
2775 }
2776
2777 return num_insns;
2778 }
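/* A worked example of the logic above, for VAL = 0x1234567800000000 in
   DImode: the value is not a single MOV/MOVN/bitmask immediate, its
   two low 16-bit chunks are zero (so zero_match == 2 and the
   bitmask-plus-MOVK attempt is skipped), and the final loop therefore
   emits the equivalent of

     movz dest, #0x5678, lsl 32
     movk dest, #0x1234, lsl 48

   and returns 2.  */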
2779
2780 /* Return whether imm is a 128-bit immediate which is simple enough to
2781 expand inline. */
2782 bool
2783 aarch64_mov128_immediate (rtx imm)
2784 {
2785 if (GET_CODE (imm) == CONST_INT)
2786 return true;
2787
2788 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2789
2790 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2791 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2792
2793 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2794 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2795 }
2796
2797
2798 /* Return the number of temporary registers that aarch64_add_offset_1
2799 would need to add OFFSET to a register. */
2800
2801 static unsigned int
2802 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2803 {
2804 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2805 }
2806
2807 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2808 a non-polynomial OFFSET. MODE is the mode of the addition.
2809 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2810 be set and CFA adjustments added to the generated instructions.
2811
2812 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2813 temporary if register allocation is already complete. This temporary
2814 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2815 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2816 the immediate again.
2817
2818 Since this function may be used to adjust the stack pointer, we must
2819 ensure that it cannot cause transient stack deallocation (for example
2820 by first incrementing SP and then decrementing when adjusting by a
2821 large immediate). */
2822
2823 static void
2824 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2825 rtx src, HOST_WIDE_INT offset, rtx temp1,
2826 bool frame_related_p, bool emit_move_imm)
2827 {
2828 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2829 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2830
2831 HOST_WIDE_INT moffset = abs_hwi (offset);
2832 rtx_insn *insn;
2833
2834 if (!moffset)
2835 {
2836 if (!rtx_equal_p (dest, src))
2837 {
2838 insn = emit_insn (gen_rtx_SET (dest, src));
2839 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2840 }
2841 return;
2842 }
2843
2844 /* Single instruction adjustment. */
2845 if (aarch64_uimm12_shift (moffset))
2846 {
2847 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2848 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2849 return;
2850 }
2851
2852 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2853 and either:
2854
2855 a) the offset cannot be loaded by a 16-bit move or
2856 b) there is no spare register into which we can move it. */
2857 if (moffset < 0x1000000
2858 && ((!temp1 && !can_create_pseudo_p ())
2859 || !aarch64_move_imm (moffset, mode)))
2860 {
2861 HOST_WIDE_INT low_off = moffset & 0xfff;
2862
2863 low_off = offset < 0 ? -low_off : low_off;
2864 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2865 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2866 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2867 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2868 return;
2869 }
2870
2871 /* Emit a move immediate if required and an addition/subtraction. */
2872 if (emit_move_imm)
2873 {
2874 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2875 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2876 }
2877 insn = emit_insn (offset < 0
2878 ? gen_sub3_insn (dest, src, temp1)
2879 : gen_add3_insn (dest, src, temp1));
2880 if (frame_related_p)
2881 {
2882 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2883 rtx adj = plus_constant (mode, src, offset);
2884 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2885 }
2886 }
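/* Illustrative examples of the cases above: OFFSET = 0x123456 is not a
   (possibly shifted) 12-bit immediate and cannot be loaded with a
   single move-immediate, so it is split into two additions,
   "add dest, src, 0x456" followed by "add dest, dest, 0x123000".
   OFFSET = 0x2000000 is too large for that path, so it is moved into
   a temporary register first and added with a register-register
   addition.  */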
2887
2888 /* Return the number of temporary registers that aarch64_add_offset
2889 would need to move OFFSET into a register or add OFFSET to a register;
2890 ADD_P is true if we want the latter rather than the former. */
2891
2892 static unsigned int
2893 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2894 {
2895 /* This follows the same structure as aarch64_add_offset. */
2896 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2897 return 0;
2898
2899 unsigned int count = 0;
2900 HOST_WIDE_INT factor = offset.coeffs[1];
2901 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2902 poly_int64 poly_offset (factor, factor);
2903 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2904 /* Need one register for the ADDVL/ADDPL result. */
2905 count += 1;
2906 else if (factor != 0)
2907 {
2908 factor = abs (factor);
2909 if (factor > 16 * (factor & -factor))
2910 /* Need one register for the CNT result and one for the multiplication
2911 factor. If necessary, the second temporary can be reused for the
2912 constant part of the offset. */
2913 return 2;
2914 /* Need one register for the CNT result (which might then
2915 be shifted). */
2916 count += 1;
2917 }
2918 return count + aarch64_add_offset_1_temporaries (constant);
2919 }
2920
2921 /* If X can be represented as a poly_int64, return the number
2922 of temporaries that are required to add it to a register.
2923 Return -1 otherwise. */
2924
2925 int
2926 aarch64_add_offset_temporaries (rtx x)
2927 {
2928 poly_int64 offset;
2929 if (!poly_int_rtx_p (x, &offset))
2930 return -1;
2931 return aarch64_offset_temporaries (true, offset);
2932 }
2933
2934 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2935 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2936 be set and CFA adjustments added to the generated instructions.
2937
2938 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2939 temporary if register allocation is already complete. This temporary
2940 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2941 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2942 false to avoid emitting the immediate again.
2943
2944 TEMP2, if nonnull, is a second temporary register that doesn't
2945 overlap either DEST or SRC.
2946
2947 Since this function may be used to adjust the stack pointer, we must
2948 ensure that it cannot cause transient stack deallocation (for example
2949 by first incrementing SP and then decrementing when adjusting by a
2950 large immediate). */
2951
2952 static void
2953 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2954 poly_int64 offset, rtx temp1, rtx temp2,
2955 bool frame_related_p, bool emit_move_imm = true)
2956 {
2957 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2958 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2959 gcc_assert (temp1 == NULL_RTX
2960 || !frame_related_p
2961 || !reg_overlap_mentioned_p (temp1, dest));
2962 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2963
2964 /* Try using ADDVL or ADDPL to add the whole value. */
2965 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2966 {
2967 rtx offset_rtx = gen_int_mode (offset, mode);
2968 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2969 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2970 return;
2971 }
2972
2973 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2974 SVE vector register, over and above the minimum size of 128 bits.
2975 This is equivalent to half the value returned by CNTD with a
2976 vector shape of ALL. */
2977 HOST_WIDE_INT factor = offset.coeffs[1];
2978 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2979
2980 /* Try using ADDVL or ADDPL to add the VG-based part. */
2981 poly_int64 poly_offset (factor, factor);
2982 if (src != const0_rtx
2983 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2984 {
2985 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2986 if (frame_related_p)
2987 {
2988 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2989 RTX_FRAME_RELATED_P (insn) = true;
2990 src = dest;
2991 }
2992 else
2993 {
2994 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2995 src = aarch64_force_temporary (mode, temp1, addr);
2996 temp1 = temp2;
2997 temp2 = NULL_RTX;
2998 }
2999 }
3000 /* Otherwise use a CNT-based sequence. */
3001 else if (factor != 0)
3002 {
3003 /* Use a subtraction if we have a negative factor. */
3004 rtx_code code = PLUS;
3005 if (factor < 0)
3006 {
3007 factor = -factor;
3008 code = MINUS;
3009 }
3010
3011 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3012 into the multiplication. */
3013 rtx val;
3014 int shift = 0;
3015 if (factor & 1)
3016 /* Use a right shift by 1. */
3017 shift = -1;
3018 else
3019 factor /= 2;
3020 HOST_WIDE_INT low_bit = factor & -factor;
3021 if (factor <= 16 * low_bit)
3022 {
3023 if (factor > 16 * 8)
3024 {
3025 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3026 the value with the minimum multiplier and shift it into
3027 position. */
3028 int extra_shift = exact_log2 (low_bit);
3029 shift += extra_shift;
3030 factor >>= extra_shift;
3031 }
3032 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3033 }
3034 else
3035 {
3036 /* Use CNTD, then multiply it by FACTOR. */
3037 val = gen_int_mode (poly_int64 (2, 2), mode);
3038 val = aarch64_force_temporary (mode, temp1, val);
3039
3040 /* Go back to using a negative multiplication factor if we have
3041 no register from which to subtract. */
3042 if (code == MINUS && src == const0_rtx)
3043 {
3044 factor = -factor;
3045 code = PLUS;
3046 }
3047 rtx coeff1 = gen_int_mode (factor, mode);
3048 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3049 val = gen_rtx_MULT (mode, val, coeff1);
3050 }
3051
3052 if (shift > 0)
3053 {
3054 /* Multiply by 1 << SHIFT. */
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3057 }
3058 else if (shift == -1)
3059 {
3060 /* Divide by 2. */
3061 val = aarch64_force_temporary (mode, temp1, val);
3062 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3063 }
3064
3065 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3066 if (src != const0_rtx)
3067 {
3068 val = aarch64_force_temporary (mode, temp1, val);
3069 val = gen_rtx_fmt_ee (code, mode, src, val);
3070 }
3071 else if (code == MINUS)
3072 {
3073 val = aarch64_force_temporary (mode, temp1, val);
3074 val = gen_rtx_NEG (mode, val);
3075 }
3076
3077 if (constant == 0 || frame_related_p)
3078 {
3079 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3080 if (frame_related_p)
3081 {
3082 RTX_FRAME_RELATED_P (insn) = true;
3083 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3084 gen_rtx_SET (dest, plus_constant (Pmode, src,
3085 poly_offset)));
3086 }
3087 src = dest;
3088 if (constant == 0)
3089 return;
3090 }
3091 else
3092 {
3093 src = aarch64_force_temporary (mode, temp1, val);
3094 temp1 = temp2;
3095 temp2 = NULL_RTX;
3096 }
3097
3098 emit_move_imm = true;
3099 }
3100
3101 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3102 frame_related_p, emit_move_imm);
3103 }
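/* An illustrative decomposition for the code above: for
   OFFSET = poly_int64 (18, 16), FACTOR is 16 and CONSTANT is 2, so the
   VG-based part (16, 16) -- one full vector width -- is added with
   "addvl dest, src, #1" and aarch64_add_offset_1 then adds the
   remaining constant with "add dest, dest, 2".  Factors outside the
   ADDVL/ADDPL range fall through to the CNT-based sequence, which
   materializes the VG-based part in a temporary register first.  */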
3104
3105 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3106 than a poly_int64. */
3107
3108 void
3109 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3110 rtx offset_rtx, rtx temp1, rtx temp2)
3111 {
3112 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3113 temp1, temp2, false);
3114 }
3115
3116 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3117 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3118 if TEMP1 already contains abs (DELTA). */
3119
3120 static inline void
3121 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3122 {
3123 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3124 temp1, temp2, true, emit_move_imm);
3125 }
3126
3127 /* Subtract DELTA from the stack pointer, marking the instructions
3128 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3129 if nonnull. */
3130
3131 static inline void
3132 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3133 bool emit_move_imm = true)
3134 {
3135 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3136 temp1, temp2, frame_related_p, emit_move_imm);
3137 }
3138
3139 /* Set DEST to (vec_series BASE STEP). */
3140
3141 static void
3142 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3143 {
3144 machine_mode mode = GET_MODE (dest);
3145 scalar_mode inner = GET_MODE_INNER (mode);
3146
3147 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3148 if (!aarch64_sve_index_immediate_p (base))
3149 base = force_reg (inner, base);
3150 if (!aarch64_sve_index_immediate_p (step))
3151 step = force_reg (inner, step);
3152
3153 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3154 }
3155
3156 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3157 integer of mode SRC_MODE. Return true on success. */
3158
3159 static bool
3160 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3161 rtx src)
3162 {
3163 /* If the constant is smaller than 128 bits, we can do the move
3164 using a vector of SRC_MODEs. */
3165 if (src_mode != TImode)
3166 {
3167 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3168 GET_MODE_SIZE (src_mode));
3169 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3170 emit_move_insn (gen_lowpart (dup_mode, dest),
3171 gen_const_vec_duplicate (dup_mode, src));
3172 return true;
3173 }
3174
3175 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3176 src = force_const_mem (src_mode, src);
3177 if (!src)
3178 return false;
3179
3180 /* Make sure that the address is legitimate. */
3181 if (!aarch64_sve_ld1r_operand_p (src))
3182 {
3183 rtx addr = force_reg (Pmode, XEXP (src, 0));
3184 src = replace_equiv_address (src, addr);
3185 }
3186
3187 machine_mode mode = GET_MODE (dest);
3188 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3189 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3190 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3191 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3192 emit_insn (gen_rtx_SET (dest, src));
3193 return true;
3194 }
3195
3196 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3197 isn't a simple duplicate or series. */
3198
3199 static void
3200 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3201 {
3202 machine_mode mode = GET_MODE (src);
3203 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3204 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3205 gcc_assert (npatterns > 1);
3206
3207 if (nelts_per_pattern == 1)
3208 {
3209 /* The constant is a repeating sequence of at least two elements,
3210 where the repeating elements occupy no more than 128 bits.
3211 Get an integer representation of the replicated value. */
3212 scalar_int_mode int_mode;
3213 if (BYTES_BIG_ENDIAN)
3214 /* For now, always use LD1RQ to load the value on big-endian
3215 targets, since the handling of smaller integers includes a
3216 subreg that is semantically an element reverse. */
3217 int_mode = TImode;
3218 else
3219 {
3220 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3221 gcc_assert (int_bits <= 128);
3222 int_mode = int_mode_for_size (int_bits, 0).require ();
3223 }
3224 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3225 if (int_value
3226 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3227 return;
3228 }
3229
3230 /* Expand each pattern individually. */
3231 rtx_vector_builder builder;
3232 auto_vec<rtx, 16> vectors (npatterns);
3233 for (unsigned int i = 0; i < npatterns; ++i)
3234 {
3235 builder.new_vector (mode, 1, nelts_per_pattern);
3236 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3237 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3238 vectors.quick_push (force_reg (mode, builder.build ()));
3239 }
3240
3241 /* Use permutes to interleave the separate vectors. */
3242 while (npatterns > 1)
3243 {
3244 npatterns /= 2;
3245 for (unsigned int i = 0; i < npatterns; ++i)
3246 {
3247 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3248 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3249 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3250 vectors[i] = tmp;
3251 }
3252 }
3253 gcc_assert (vectors[0] == dest);
3254 }
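/* An illustrative example of the interleaving above: a VNx4SI constant
   with NPATTERNS == 2 and NELTS_PER_PATTERN == 3 such as
   { 0, 100, 1, 101, 2, 102, ... } is split into the two single-pattern
   vectors { 0, 1, 2, ... } and { 100, 101, 102, ... }, each of which
   is built separately, and a single ZIP1 then interleaves them into
   DEST in the original element order.  */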
3255
3256 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3257 is a pattern that can be used to set DEST to a replicated scalar
3258 element. */
3259
3260 void
3261 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3262 rtx (*gen_vec_duplicate) (rtx, rtx))
3263 {
3264 machine_mode mode = GET_MODE (dest);
3265
3266 /* Check on what type of symbol it is. */
3267 scalar_int_mode int_mode;
3268 if ((GET_CODE (imm) == SYMBOL_REF
3269 || GET_CODE (imm) == LABEL_REF
3270 || GET_CODE (imm) == CONST
3271 || GET_CODE (imm) == CONST_POLY_INT)
3272 && is_a <scalar_int_mode> (mode, &int_mode))
3273 {
3274 rtx mem;
3275 poly_int64 offset;
3276 HOST_WIDE_INT const_offset;
3277 enum aarch64_symbol_type sty;
3278
3279 /* If we have (const (plus symbol offset)), separate out the offset
3280 before we start classifying the symbol. */
3281 rtx base = strip_offset (imm, &offset);
3282
3283 /* We must always add an offset involving VL separately, rather than
3284 folding it into the relocation. */
3285 if (!offset.is_constant (&const_offset))
3286 {
3287 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3288 emit_insn (gen_rtx_SET (dest, imm));
3289 else
3290 {
3291 /* Do arithmetic on 32-bit values if the result is smaller
3292 than that. */
3293 if (partial_subreg_p (int_mode, SImode))
3294 {
3295 /* It is invalid to do symbol calculations in modes
3296 narrower than SImode. */
3297 gcc_assert (base == const0_rtx);
3298 dest = gen_lowpart (SImode, dest);
3299 int_mode = SImode;
3300 }
3301 if (base != const0_rtx)
3302 {
3303 base = aarch64_force_temporary (int_mode, dest, base);
3304 aarch64_add_offset (int_mode, dest, base, offset,
3305 NULL_RTX, NULL_RTX, false);
3306 }
3307 else
3308 aarch64_add_offset (int_mode, dest, base, offset,
3309 dest, NULL_RTX, false);
3310 }
3311 return;
3312 }
3313
3314 sty = aarch64_classify_symbol (base, const_offset);
3315 switch (sty)
3316 {
3317 case SYMBOL_FORCE_TO_MEM:
3318 if (const_offset != 0
3319 && targetm.cannot_force_const_mem (int_mode, imm))
3320 {
3321 gcc_assert (can_create_pseudo_p ());
3322 base = aarch64_force_temporary (int_mode, dest, base);
3323 aarch64_add_offset (int_mode, dest, base, const_offset,
3324 NULL_RTX, NULL_RTX, false);
3325 return;
3326 }
3327
3328 mem = force_const_mem (ptr_mode, imm);
3329 gcc_assert (mem);
3330
3331 /* If we aren't generating PC relative literals, then
3332 we need to expand the literal pool access carefully.
3333 This is something that needs to be done in a number
3334 of places, so could well live as a separate function. */
3335 if (!aarch64_pcrelative_literal_loads)
3336 {
3337 gcc_assert (can_create_pseudo_p ());
3338 base = gen_reg_rtx (ptr_mode);
3339 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3340 if (ptr_mode != Pmode)
3341 base = convert_memory_address (Pmode, base);
3342 mem = gen_rtx_MEM (ptr_mode, base);
3343 }
3344
3345 if (int_mode != ptr_mode)
3346 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3347
3348 emit_insn (gen_rtx_SET (dest, mem));
3349
3350 return;
3351
3352 case SYMBOL_SMALL_TLSGD:
3353 case SYMBOL_SMALL_TLSDESC:
3354 case SYMBOL_SMALL_TLSIE:
3355 case SYMBOL_SMALL_GOT_28K:
3356 case SYMBOL_SMALL_GOT_4G:
3357 case SYMBOL_TINY_GOT:
3358 case SYMBOL_TINY_TLSIE:
3359 if (const_offset != 0)
3360 {
3361 gcc_assert(can_create_pseudo_p ());
3362 base = aarch64_force_temporary (int_mode, dest, base);
3363 aarch64_add_offset (int_mode, dest, base, const_offset,
3364 NULL_RTX, NULL_RTX, false);
3365 return;
3366 }
3367 /* FALLTHRU */
3368
3369 case SYMBOL_SMALL_ABSOLUTE:
3370 case SYMBOL_TINY_ABSOLUTE:
3371 case SYMBOL_TLSLE12:
3372 case SYMBOL_TLSLE24:
3373 case SYMBOL_TLSLE32:
3374 case SYMBOL_TLSLE48:
3375 aarch64_load_symref_appropriately (dest, imm, sty);
3376 return;
3377
3378 default:
3379 gcc_unreachable ();
3380 }
3381 }
3382
3383 if (!CONST_INT_P (imm))
3384 {
3385 rtx base, step, value;
3386 if (GET_CODE (imm) == HIGH
3387 || aarch64_simd_valid_immediate (imm, NULL))
3388 emit_insn (gen_rtx_SET (dest, imm));
3389 else if (const_vec_series_p (imm, &base, &step))
3390 aarch64_expand_vec_series (dest, base, step);
3391 else if (const_vec_duplicate_p (imm, &value))
3392 {
3393 /* If the constant is out of range of an SVE vector move,
3394 load it from memory if we can, otherwise move it into
3395 a register and use a DUP. */
3396 scalar_mode inner_mode = GET_MODE_INNER (mode);
3397 rtx op = force_const_mem (inner_mode, value);
3398 if (!op)
3399 op = force_reg (inner_mode, value);
3400 else if (!aarch64_sve_ld1r_operand_p (op))
3401 {
3402 rtx addr = force_reg (Pmode, XEXP (op, 0));
3403 op = replace_equiv_address (op, addr);
3404 }
3405 emit_insn (gen_vec_duplicate (dest, op));
3406 }
3407 else if (GET_CODE (imm) == CONST_VECTOR
3408 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3409 aarch64_expand_sve_const_vector (dest, imm);
3410 else
3411 {
3412 rtx mem = force_const_mem (mode, imm);
3413 gcc_assert (mem);
3414 emit_move_insn (dest, mem);
3415 }
3416
3417 return;
3418 }
3419
3420 aarch64_internal_mov_immediate (dest, imm, true,
3421 as_a <scalar_int_mode> (mode));
3422 }
3423
3424 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3425 that is known to contain PTRUE. */
3426
3427 void
3428 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3429 {
3430 expand_operand ops[3];
3431 machine_mode mode = GET_MODE (dest);
3432 create_output_operand (&ops[0], dest, mode);
3433 create_input_operand (&ops[1], pred, GET_MODE(pred));
3434 create_input_operand (&ops[2], src, mode);
3435 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3436 }
3437
3438 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3439 operand is in memory. In this case we need to use the predicated LD1
3440 and ST1 instead of LDR and STR, both for correctness on big-endian
3441 targets and because LD1 and ST1 support a wider range of addressing modes.
3442 PRED_MODE is the mode of the predicate.
3443
3444 See the comment at the head of aarch64-sve.md for details about the
3445 big-endian handling. */
3446
3447 void
3448 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3449 {
3450 machine_mode mode = GET_MODE (dest);
3451 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3452 if (!register_operand (src, mode)
3453 && !register_operand (dest, mode))
3454 {
3455 rtx tmp = gen_reg_rtx (mode);
3456 if (MEM_P (src))
3457 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3458 else
3459 emit_move_insn (tmp, src);
3460 src = tmp;
3461 }
3462 aarch64_emit_sve_pred_move (dest, ptrue, src);
3463 }
3464
3465 /* Called only on big-endian targets. See whether an SVE vector move
3466 from SRC to DEST is effectively a REV[BHW] instruction, because at
3467 least one operand is a subreg of an SVE vector that has wider or
3468 narrower elements. Return true and emit the instruction if so.
3469
3470 For example:
3471
3472 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3473
3474 represents a VIEW_CONVERT between the following vectors, viewed
3475 in memory order:
3476
3477 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3478 R1: { [0], [1], [2], [3], ... }
3479
3480 The high part of lane X in R2 should therefore correspond to lane X*2
3481 of R1, but the register representations are:
3482
3483 msb lsb
3484 R2: ...... [1].high [1].low [0].high [0].low
3485 R1: ...... [3] [2] [1] [0]
3486
3487 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3488 We therefore need a reverse operation to swap the high and low values
3489 around.
3490
3491 This is purely an optimization. Without it we would spill the
3492 subreg operand to the stack in one mode and reload it in the
3493 other mode, which has the same effect as the REV. */
3494
3495 bool
3496 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3497 {
3498 gcc_assert (BYTES_BIG_ENDIAN);
3499 if (GET_CODE (dest) == SUBREG)
3500 dest = SUBREG_REG (dest);
3501 if (GET_CODE (src) == SUBREG)
3502 src = SUBREG_REG (src);
3503
3504 /* The optimization handles two single SVE REGs with different element
3505 sizes. */
3506 if (!REG_P (dest)
3507 || !REG_P (src)
3508 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3509 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3510 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3511 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3512 return false;
3513
3514 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3515 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3516 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3517 UNSPEC_REV_SUBREG);
3518 emit_insn (gen_rtx_SET (dest, unspec));
3519 return true;
3520 }
3521
3522 /* Return a copy of X with mode MODE, without changing its other
3523 attributes. Unlike gen_lowpart, this doesn't care whether the
3524 mode change is valid. */
3525
3526 static rtx
3527 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3528 {
3529 if (GET_MODE (x) == mode)
3530 return x;
3531
3532 x = shallow_copy_rtx (x);
3533 set_mode_and_regno (x, mode, REGNO (x));
3534 return x;
3535 }
3536
3537 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3538 operands. */
3539
3540 void
3541 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3542 {
3543 /* Decide which REV operation we need. The mode with narrower elements
3544 determines the mode of the operands and the mode with the wider
3545 elements determines the reverse width. */
3546 machine_mode mode_with_wider_elts = GET_MODE (dest);
3547 machine_mode mode_with_narrower_elts = GET_MODE (src);
3548 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3549 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3550 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3551
3552 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3553 unsigned int unspec;
3554 if (wider_bytes == 8)
3555 unspec = UNSPEC_REV64;
3556 else if (wider_bytes == 4)
3557 unspec = UNSPEC_REV32;
3558 else if (wider_bytes == 2)
3559 unspec = UNSPEC_REV16;
3560 else
3561 gcc_unreachable ();
3562 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3563
3564 /* Emit:
3565
3566 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3567 UNSPEC_MERGE_PTRUE))
3568
3569 with the appropriate modes. */
3570 ptrue = gen_lowpart (pred_mode, ptrue);
3571 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3572 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3573 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3574 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3575 UNSPEC_MERGE_PTRUE);
3576 emit_insn (gen_rtx_SET (dest, src));
3577 }
3578
3579 static bool
3580 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3581 tree exp ATTRIBUTE_UNUSED)
3582 {
3583 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3584 return false;
3585
3586 return true;
3587 }
3588
3589 /* Implement TARGET_PASS_BY_REFERENCE. */
3590
3591 static bool
3592 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3593 machine_mode mode,
3594 const_tree type,
3595 bool named ATTRIBUTE_UNUSED)
3596 {
3597 HOST_WIDE_INT size;
3598 machine_mode dummymode;
3599 int nregs;
3600
3601 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3602 if (mode == BLKmode && type)
3603 size = int_size_in_bytes (type);
3604 else
3605 /* No frontends can create types with variable-sized modes, so we
3606 shouldn't be asked to pass or return them. */
3607 size = GET_MODE_SIZE (mode).to_constant ();
3608
3609 /* Aggregates are passed by reference based on their size. */
3610 if (type && AGGREGATE_TYPE_P (type))
3611 {
3612 size = int_size_in_bytes (type);
3613 }
3614
3615 /* Variable-sized arguments are always passed by reference. */
3616 if (size < 0)
3617 return true;
3618
3619 /* Can this be a candidate to be passed in fp/simd register(s)? */
3620 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3621 &dummymode, &nregs,
3622 NULL))
3623 return false;
3624
3625 /* Arguments which are variable sized or larger than 2 registers are
3626 passed by reference unless they are a homogeneous floating-point
3627 aggregate. */
3628 return size > 2 * UNITS_PER_WORD;
3629 }
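/* Illustrative examples of the rules above (a sketch, not a restatement of
   the AAPCS64 itself): a struct of three uint64_t members (24 bytes) is
   larger than two GP registers and is passed by reference, whereas a struct
   of three doubles, although also 24 bytes, is a homogeneous floating-point
   aggregate and so is not passed by reference but remains a candidate for
   the SIMD/FP registers.  */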
3630
3631 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3632 static bool
3633 aarch64_return_in_msb (const_tree valtype)
3634 {
3635 machine_mode dummy_mode;
3636 int dummy_int;
3637
3638 /* Never happens in little-endian mode. */
3639 if (!BYTES_BIG_ENDIAN)
3640 return false;
3641
3642 /* Only composite types no larger than 16 bytes can potentially
3643 be returned in registers. */
3644 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3645 || int_size_in_bytes (valtype) <= 0
3646 || int_size_in_bytes (valtype) > 16)
3647 return false;
3648
3649 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3650 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3651 is always passed/returned in the least significant bits of fp/simd
3652 register(s). */
3653 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3654 &dummy_mode, &dummy_int, NULL))
3655 return false;
3656
3657 return true;
3658 }
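/* For illustration (big-endian only, since the function above always
   returns false otherwise): a 6-byte struct of three shorts is a composite
   of at most 16 bytes and not an HFA/HVA, so the hook returns true and the
   value is placed at the most significant end of its return register; a
   struct of two floats is an HFA and stays in the least significant bits of
   the SIMD/FP registers.  */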
3659
3660 /* Implement TARGET_FUNCTION_VALUE.
3661 Define how to find the value returned by a function. */
3662
3663 static rtx
3664 aarch64_function_value (const_tree type, const_tree func,
3665 bool outgoing ATTRIBUTE_UNUSED)
3666 {
3667 machine_mode mode;
3668 int unsignedp;
3669 int count;
3670 machine_mode ag_mode;
3671
3672 mode = TYPE_MODE (type);
3673 if (INTEGRAL_TYPE_P (type))
3674 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3675
3676 if (aarch64_return_in_msb (type))
3677 {
3678 HOST_WIDE_INT size = int_size_in_bytes (type);
3679
3680 if (size % UNITS_PER_WORD != 0)
3681 {
3682 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3683 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3684 }
3685 }
3686
3687 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3688 &ag_mode, &count, NULL))
3689 {
3690 if (!aarch64_composite_type_p (type, mode))
3691 {
3692 gcc_assert (count == 1 && mode == ag_mode);
3693 return gen_rtx_REG (mode, V0_REGNUM);
3694 }
3695 else
3696 {
3697 int i;
3698 rtx par;
3699
3700 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3701 for (i = 0; i < count; i++)
3702 {
3703 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3704 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3705 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3706 XVECEXP (par, 0, i) = tmp;
3707 }
3708 return par;
3709 }
3710 }
3711 else
3712 return gen_rtx_REG (mode, R0_REGNUM);
3713 }
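/* A rough example of the PARALLEL case above: for a return type such as
   struct { double x, y; } (an HFA with two members), COUNT is 2 and AG_MODE
   is DFmode, so the result is a PARALLEL containing (reg:DF V0) at offset 0
   and (reg:DF V1) at offset 8.  A non-composite scalar double would instead
   take the simple gen_rtx_REG (DFmode, V0_REGNUM) path.  */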
3714
3715 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3716 Return true if REGNO is the number of a hard register in which the values
3717 of a called function may come back. */
3718
3719 static bool
3720 aarch64_function_value_regno_p (const unsigned int regno)
3721 {
3722 /* A maximum of 16 bytes can be returned in the general registers. Examples
3723 of 16-byte return values are: 128-bit integers and 16-byte small
3724 structures (excluding homogeneous floating-point aggregates). */
3725 if (regno == R0_REGNUM || regno == R1_REGNUM)
3726 return true;
3727
3728 /* Up to four fp/simd registers can return a function value, e.g. a
3729 homogeneous floating-point aggregate having four members. */
3730 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3731 return TARGET_FLOAT;
3732
3733 return false;
3734 }
3735
3736 /* Implement TARGET_RETURN_IN_MEMORY.
3737
3738 If the type T of the result of a function is such that
3739 void func (T arg)
3740 would require that arg be passed as a value in a register (or set of
3741 registers) according to the parameter passing rules, then the result
3742 is returned in the same registers as would be used for such an
3743 argument. */
3744
3745 static bool
3746 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3747 {
3748 HOST_WIDE_INT size;
3749 machine_mode ag_mode;
3750 int count;
3751
3752 if (!AGGREGATE_TYPE_P (type)
3753 && TREE_CODE (type) != COMPLEX_TYPE
3754 && TREE_CODE (type) != VECTOR_TYPE)
3755 /* Simple scalar types are always returned in registers. */
3756 return false;
3757
3758 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3759 type,
3760 &ag_mode,
3761 &count,
3762 NULL))
3763 return false;
3764
3765 /* Types larger than 2 registers are returned in memory. */
3766 size = int_size_in_bytes (type);
3767 return (size < 0 || size > 2 * UNITS_PER_WORD);
3768 }
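/* Examples (illustrative only): a struct of four floats (16 bytes) is an
   HFA/HVA candidate and is returned in registers, while a struct of five
   ints (20 bytes) is larger than two GP registers and is returned in
   memory.  */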
3769
3770 static bool
3771 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3772 const_tree type, int *nregs)
3773 {
3774 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3775 return aarch64_vfp_is_call_or_return_candidate (mode,
3776 type,
3777 &pcum->aapcs_vfp_rmode,
3778 nregs,
3779 NULL);
3780 }
3781
3782 /* Given MODE and TYPE of a function argument, return the alignment in
3783 bits. The idea is to suppress any stronger alignment requested by
3784 the user and opt for the natural alignment (specified in AAPCS64 \S
3785 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3786 calculated in versions of GCC prior to GCC-9. This is a helper
3787 function for local use only. */
3788
3789 static unsigned int
3790 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3791 bool *abi_break)
3792 {
3793 *abi_break = false;
3794 if (!type)
3795 return GET_MODE_ALIGNMENT (mode);
3796
3797 if (integer_zerop (TYPE_SIZE (type)))
3798 return 0;
3799
3800 gcc_assert (TYPE_MODE (type) == mode);
3801
3802 if (!AGGREGATE_TYPE_P (type))
3803 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3804
3805 if (TREE_CODE (type) == ARRAY_TYPE)
3806 return TYPE_ALIGN (TREE_TYPE (type));
3807
3808 unsigned int alignment = 0;
3809 unsigned int bitfield_alignment = 0;
3810 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3811 if (TREE_CODE (field) == FIELD_DECL)
3812 {
3813 alignment = std::max (alignment, DECL_ALIGN (field));
3814 if (DECL_BIT_FIELD_TYPE (field))
3815 bitfield_alignment
3816 = std::max (bitfield_alignment,
3817 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3818 }
3819
3820 if (bitfield_alignment > alignment)
3821 {
3822 *abi_break = true;
3823 return bitfield_alignment;
3824 }
3825
3826 return alignment;
3827 }
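/* As a worked illustration of the ABI_BREAK logic above: if every
   FIELD_DECL in a struct has, say, 8-bit alignment but one of them is a
   bit-field whose declared type has 64-bit alignment, then ALIGNMENT is 8
   and BITFIELD_ALIGNMENT is 64; the function sets *ABI_BREAK and returns
   64, and callers use *ABI_BREAK to warn that GCC versions before 9.1
   computed this case differently.  */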
3828
3829 /* Lay out a function argument according to the AAPCS64 rules. The rule
3830 numbers refer to the rule numbers in the AAPCS64. */
3831
3832 static void
3833 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3834 const_tree type,
3835 bool named ATTRIBUTE_UNUSED)
3836 {
3837 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3838 int ncrn, nvrn, nregs;
3839 bool allocate_ncrn, allocate_nvrn;
3840 HOST_WIDE_INT size;
3841 bool abi_break;
3842
3843 /* We need to do this once per argument. */
3844 if (pcum->aapcs_arg_processed)
3845 return;
3846
3847 pcum->aapcs_arg_processed = true;
3848
3849 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3850 if (type)
3851 size = int_size_in_bytes (type);
3852 else
3853 /* No frontends can create types with variable-sized modes, so we
3854 shouldn't be asked to pass or return them. */
3855 size = GET_MODE_SIZE (mode).to_constant ();
3856 size = ROUND_UP (size, UNITS_PER_WORD);
3857
3858 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3859 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3860 mode,
3861 type,
3862 &nregs);
3863
3864 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3865 The following code thus handles passing by SIMD/FP registers first. */
3866
3867 nvrn = pcum->aapcs_nvrn;
3868
3869 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
3870 and homogeneous short-vector aggregates (HVA). */
3871 if (allocate_nvrn)
3872 {
3873 if (!TARGET_FLOAT)
3874 aarch64_err_no_fpadvsimd (mode);
3875
3876 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3877 {
3878 pcum->aapcs_nextnvrn = nvrn + nregs;
3879 if (!aarch64_composite_type_p (type, mode))
3880 {
3881 gcc_assert (nregs == 1);
3882 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3883 }
3884 else
3885 {
3886 rtx par;
3887 int i;
3888 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3889 for (i = 0; i < nregs; i++)
3890 {
3891 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3892 V0_REGNUM + nvrn + i);
3893 rtx offset = gen_int_mode
3894 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3895 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3896 XVECEXP (par, 0, i) = tmp;
3897 }
3898 pcum->aapcs_reg = par;
3899 }
3900 return;
3901 }
3902 else
3903 {
3904 /* C.3 NSRN is set to 8. */
3905 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3906 goto on_stack;
3907 }
3908 }
3909
3910 ncrn = pcum->aapcs_ncrn;
3911 nregs = size / UNITS_PER_WORD;
3912
3913 /* C.6 - C.9, though the sign and zero extension semantics are
3914 handled elsewhere. This is the case where the argument fits
3915 entirely in general registers. */
3916 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3917 {
3918 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3919
3920 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3921 rounded up to the next even number. */
3922 if (nregs == 2
3923 && ncrn % 2
3924 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3925 comparison is there because for > 16 * BITS_PER_UNIT
3926 alignment nregs should be > 2 and therefore it should be
3927 passed by reference rather than value. */
3928 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3929 == 16 * BITS_PER_UNIT))
3930 {
3931 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3932 inform (input_location, "parameter passing for argument of type "
3933 "%qT changed in GCC 9.1", type);
3934 ++ncrn;
3935 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3936 }
3937
3938 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3939 A reg is still generated for it, but the caller should be smart
3940 enough not to use it. */
3941 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3942 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3943 else
3944 {
3945 rtx par;
3946 int i;
3947
3948 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3949 for (i = 0; i < nregs; i++)
3950 {
3951 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3952 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3953 GEN_INT (i * UNITS_PER_WORD));
3954 XVECEXP (par, 0, i) = tmp;
3955 }
3956 pcum->aapcs_reg = par;
3957 }
3958
3959 pcum->aapcs_nextncrn = ncrn + nregs;
3960 return;
3961 }
3962
3963 /* C.11 */
3964 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3965
3966 /* The argument is passed on the stack; record the needed number of words for
3967 this argument and align the total size if necessary. */
3968 on_stack:
3969 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3970
3971 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3972 == 16 * BITS_PER_UNIT)
3973 {
3974 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
3975 if (pcum->aapcs_stack_size != new_size)
3976 {
3977 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3978 inform (input_location, "parameter passing for argument of type "
3979 "%qT changed in GCC 9.1", type);
3980 pcum->aapcs_stack_size = new_size;
3981 }
3982 }
3983 return;
3984 }
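/* A small worked example of rule C.8 above (illustrative numbers only):
   suppose one 8-byte integer argument has already been assigned, so NCRN is
   1, and the next argument is a 16-byte type with 16-byte alignment.  NCRN
   is odd, so the ++ncrn above skips X1 and the argument is passed in X2/X3;
   X1 is simply left unused.  */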
3985
3986 /* Implement TARGET_FUNCTION_ARG. */
3987
3988 static rtx
3989 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3990 const_tree type, bool named)
3991 {
3992 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3993 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3994
3995 if (mode == VOIDmode)
3996 return NULL_RTX;
3997
3998 aarch64_layout_arg (pcum_v, mode, type, named);
3999 return pcum->aapcs_reg;
4000 }
4001
4002 void
4003 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4004 const_tree fntype ATTRIBUTE_UNUSED,
4005 rtx libname ATTRIBUTE_UNUSED,
4006 const_tree fndecl ATTRIBUTE_UNUSED,
4007 unsigned n_named ATTRIBUTE_UNUSED)
4008 {
4009 pcum->aapcs_ncrn = 0;
4010 pcum->aapcs_nvrn = 0;
4011 pcum->aapcs_nextncrn = 0;
4012 pcum->aapcs_nextnvrn = 0;
4013 pcum->pcs_variant = ARM_PCS_AAPCS64;
4014 pcum->aapcs_reg = NULL_RTX;
4015 pcum->aapcs_arg_processed = false;
4016 pcum->aapcs_stack_words = 0;
4017 pcum->aapcs_stack_size = 0;
4018
4019 if (!TARGET_FLOAT
4020 && fndecl && TREE_PUBLIC (fndecl)
4021 && fntype && fntype != error_mark_node)
4022 {
4023 const_tree type = TREE_TYPE (fntype);
4024 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4025 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4026 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4027 &mode, &nregs, NULL))
4028 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4029 }
4030 return;
4031 }
4032
4033 static void
4034 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4035 machine_mode mode,
4036 const_tree type,
4037 bool named)
4038 {
4039 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4040 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4041 {
4042 aarch64_layout_arg (pcum_v, mode, type, named);
4043 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4044 != (pcum->aapcs_stack_words != 0));
4045 pcum->aapcs_arg_processed = false;
4046 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4047 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4048 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4049 pcum->aapcs_stack_words = 0;
4050 pcum->aapcs_reg = NULL_RTX;
4051 }
4052 }
4053
4054 bool
4055 aarch64_function_arg_regno_p (unsigned regno)
4056 {
4057 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4058 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4059 }
4060
4061 /* Implement TARGET_FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4062 PARM_BOUNDARY bits of alignment, but will be given anything up
4063 to STACK_BOUNDARY bits if the type requires it. This makes sure
4064 that both before and after the layout of each argument, the Next
4065 Stacked Argument Address (NSAA) will have a minimum alignment of
4066 8 bytes. */
4067
4068 static unsigned int
4069 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4070 {
4071 bool abi_break;
4072 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4073 &abi_break);
4074 if (abi_break && warn_psabi)
4075 inform (input_location, "parameter passing for argument of type "
4076 "%qT changed in GCC 9.1", type);
4077
4078 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4079 }
4080
4081 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4082
4083 static fixed_size_mode
4084 aarch64_get_reg_raw_mode (int regno)
4085 {
4086 if (TARGET_SVE && FP_REGNUM_P (regno))
4087 /* Don't use the SVE part of the register for __builtin_apply and
4088 __builtin_return. The SVE registers aren't used by the normal PCS,
4089 so using them there would be a waste of time. The PCS extensions
4090 for SVE types are fundamentally incompatible with the
4091 __builtin_return/__builtin_apply interface. */
4092 return as_a <fixed_size_mode> (V16QImode);
4093 return default_get_reg_raw_mode (regno);
4094 }
4095
4096 /* Implement TARGET_FUNCTION_ARG_PADDING.
4097
4098 Small aggregate types are placed in the lowest memory address.
4099
4100 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4101
4102 static pad_direction
4103 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4104 {
4105 /* On little-endian targets, the least significant byte of every stack
4106 argument is passed at the lowest byte address of the stack slot. */
4107 if (!BYTES_BIG_ENDIAN)
4108 return PAD_UPWARD;
4109
4110 /* Otherwise, integral, floating-point and pointer types are padded downward:
4111 the least significant byte of a stack argument is passed at the highest
4112 byte address of the stack slot. */
4113 if (type
4114 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4115 || POINTER_TYPE_P (type))
4116 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4117 return PAD_DOWNWARD;
4118
4119 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4120 return PAD_UPWARD;
4121 }
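/* For example (big-endian only, since little-endian always pads upward): a
   char argument passed on the stack is integral and therefore padded
   downward, i.e. the byte lives at the highest address of its slot, whereas
   a 3-byte struct is an aggregate and is padded upward, with its data at
   the lowest addresses of the slot.  */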
4122
4123 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4124
4125 It specifies padding for the last (which may also be the only)
4126 element of a block move between registers and memory. Assuming
4127 the block is in memory, padding upward means that the last
4128 element is padded after its most significant byte, while with
4129 downward padding the last element is padded at its least
4130 significant byte side.
4131
4132 Small aggregates and small complex types are always padded
4133 upwards.
4134
4135 We don't need to worry about homogeneous floating-point or
4136 short-vector aggregates; their move is not affected by the
4137 padding direction determined here. Regardless of endianness,
4138 each element of such an aggregate is put in the least
4139 significant bits of a fp/simd register.
4140
4141 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4142 register has useful data, and return the opposite if the most
4143 significant byte does. */
4144
4145 bool
4146 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4147 bool first ATTRIBUTE_UNUSED)
4148 {
4149
4150 /* Small composite types are always padded upward. */
4151 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4152 {
4153 HOST_WIDE_INT size;
4154 if (type)
4155 size = int_size_in_bytes (type);
4156 else
4157 /* No frontends can create types with variable-sized modes, so we
4158 shouldn't be asked to pass or return them. */
4159 size = GET_MODE_SIZE (mode).to_constant ();
4160 if (size < 2 * UNITS_PER_WORD)
4161 return true;
4162 }
4163
4164 /* Otherwise, use the default padding. */
4165 return !BYTES_BIG_ENDIAN;
4166 }
4167
4168 static scalar_int_mode
4169 aarch64_libgcc_cmp_return_mode (void)
4170 {
4171 return SImode;
4172 }
4173
4174 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4175
4176 /* We use the 12-bit shifted immediate arithmetic instructions so values
4177 must be a multiple of (1 << 12), i.e. 4096. */
4178 #define ARITH_FACTOR 4096
4179
4180 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4181 #error Cannot use simple address calculation for stack probing
4182 #endif
4183
4184 /* The pair of scratch registers used for stack probing. */
4185 #define PROBE_STACK_FIRST_REG R9_REGNUM
4186 #define PROBE_STACK_SECOND_REG R10_REGNUM
4187
4188 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4189 inclusive. These are offsets from the current stack pointer. */
4190
4191 static void
4192 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4193 {
4194 HOST_WIDE_INT size;
4195 if (!poly_size.is_constant (&size))
4196 {
4197 sorry ("stack probes for SVE frames");
4198 return;
4199 }
4200
4201 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4202
4203 /* See the same assertion on PROBE_INTERVAL above. */
4204 gcc_assert ((first % ARITH_FACTOR) == 0);
4205
4206 /* See if we have a constant small number of probes to generate. If so,
4207 that's the easy case. */
4208 if (size <= PROBE_INTERVAL)
4209 {
4210 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4211
4212 emit_set_insn (reg1,
4213 plus_constant (Pmode,
4214 stack_pointer_rtx, -(first + base)));
4215 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4216 }
4217
4218 /* The run-time loop is made up of 8 insns in the generic case while the
4219 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4220 else if (size <= 4 * PROBE_INTERVAL)
4221 {
4222 HOST_WIDE_INT i, rem;
4223
4224 emit_set_insn (reg1,
4225 plus_constant (Pmode,
4226 stack_pointer_rtx,
4227 -(first + PROBE_INTERVAL)));
4228 emit_stack_probe (reg1);
4229
4230 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4231 it exceeds SIZE. If only two probes are needed, this will not
4232 generate any code. Then probe at FIRST + SIZE. */
4233 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4234 {
4235 emit_set_insn (reg1,
4236 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4237 emit_stack_probe (reg1);
4238 }
4239
4240 rem = size - (i - PROBE_INTERVAL);
4241 if (rem > 256)
4242 {
4243 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4244
4245 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4246 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4247 }
4248 else
4249 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4250 }
4251
4252 /* Otherwise, do the same as above, but in a loop. Note that we must be
4253 extra careful with variables wrapping around because we might be at
4254 the very top (or the very bottom) of the address space and we have
4255 to be able to handle this case properly; in particular, we use an
4256 equality test for the loop condition. */
4257 else
4258 {
4259 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4260
4261 /* Step 1: round SIZE to the previous multiple of the interval. */
4262
4263 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4264
4265
4266 /* Step 2: compute initial and final value of the loop counter. */
4267
4268 /* TEST_ADDR = SP + FIRST. */
4269 emit_set_insn (reg1,
4270 plus_constant (Pmode, stack_pointer_rtx, -first));
4271
4272 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4273 HOST_WIDE_INT adjustment = - (first + rounded_size);
4274 if (! aarch64_uimm12_shift (adjustment))
4275 {
4276 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4277 true, Pmode);
4278 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4279 }
4280 else
4281 emit_set_insn (reg2,
4282 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4283
4284 /* Step 3: the loop
4285
4286 do
4287 {
4288 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4289 probe at TEST_ADDR
4290 }
4291 while (TEST_ADDR != LAST_ADDR)
4292
4293 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4294 until it is equal to ROUNDED_SIZE. */
4295
4296 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4297
4298
4299 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4300 that SIZE is equal to ROUNDED_SIZE. */
4301
4302 if (size != rounded_size)
4303 {
4304 HOST_WIDE_INT rem = size - rounded_size;
4305
4306 if (rem > 256)
4307 {
4308 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4309
4310 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4311 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4312 }
4313 else
4314 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4315 }
4316 }
4317
4318 /* Make sure nothing is scheduled before we are done. */
4319 emit_insn (gen_blockage ());
4320 }
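/* A numeric sketch of the simplest case above: assuming PROBE_INTERVAL is
   4096, with FIRST = 4096 and a constant SIZE of 2000 we have
   BASE = ROUND_UP (2000, 4096) = 4096, so REG1 is set to SP - 8192 and the
   single probe lands at REG1 + 2096, i.e. exactly SP - (FIRST + SIZE).  */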
4321
4322 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4323 absolute addresses. */
4324
4325 const char *
4326 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4327 {
4328 static int labelno = 0;
4329 char loop_lab[32];
4330 rtx xops[2];
4331
4332 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4333
4334 /* Loop. */
4335 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4336
4337 HOST_WIDE_INT stack_clash_probe_interval
4338 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4339
4340 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4341 xops[0] = reg1;
4342 HOST_WIDE_INT interval;
4343 if (flag_stack_clash_protection)
4344 interval = stack_clash_probe_interval;
4345 else
4346 interval = PROBE_INTERVAL;
4347
4348 gcc_assert (aarch64_uimm12_shift (interval));
4349 xops[1] = GEN_INT (interval);
4350
4351 output_asm_insn ("sub\t%0, %0, %1", xops);
4352
4353 /* If doing stack clash protection then we probe up by the ABI-specified
4354 amount. We do this because we're dropping full pages at a time in the
4355 loop. But if we're doing non-stack-clash probing, probe at offset 0 from SP. */
4356 if (flag_stack_clash_protection)
4357 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4358 else
4359 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4360
4361 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4362 by this amount for each iteration. */
4363 output_asm_insn ("str\txzr, [%0, %1]", xops);
4364
4365 /* Test if TEST_ADDR == LAST_ADDR. */
4366 xops[1] = reg2;
4367 output_asm_insn ("cmp\t%0, %1", xops);
4368
4369 /* Branch. */
4370 fputs ("\tb.ne\t", asm_out_file);
4371 assemble_name_raw (asm_out_file, loop_lab);
4372 fputc ('\n', asm_out_file);
4373
4374 return "";
4375 }
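/* The loop above comes out roughly as follows (illustrative: assuming the
   scratch registers are x9/x10, no stack clash protection and a 4096-byte
   PROBE_INTERVAL):

   .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0  */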
4376
4377 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4378 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4379 of GUARD_SIZE. When a probe is emitted it is done at most
4380 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4381 at most MIN_PROBE_THRESHOLD. By the end of this function
4382 BASE = BASE - ADJUSTMENT. */
4383
4384 const char *
4385 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4386 rtx min_probe_threshold, rtx guard_size)
4387 {
4388 /* This function is not allowed to use any instruction generation function
4389 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4390 so instead emit the code you want using output_asm_insn. */
4391 gcc_assert (flag_stack_clash_protection);
4392 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4393 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4394
4395 /* The minimum required allocation before the residual requires probing. */
4396 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4397
4398 /* Clamp the value down to the nearest value that can be used with a cmp. */
4399 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4400 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4401
4402 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4403 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4404
4405 static int labelno = 0;
4406 char loop_start_lab[32];
4407 char loop_end_lab[32];
4408 rtx xops[2];
4409
4410 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4411 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4412
4413 /* Emit loop start label. */
4414 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4415
4416 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4417 xops[0] = adjustment;
4418 xops[1] = probe_offset_value_rtx;
4419 output_asm_insn ("cmp\t%0, %1", xops);
4420
4421 /* Branch to end if not enough adjustment to probe. */
4422 fputs ("\tb.lt\t", asm_out_file);
4423 assemble_name_raw (asm_out_file, loop_end_lab);
4424 fputc ('\n', asm_out_file);
4425
4426 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4427 xops[0] = base;
4428 xops[1] = probe_offset_value_rtx;
4429 output_asm_insn ("sub\t%0, %0, %1", xops);
4430
4431 /* Probe at BASE. */
4432 xops[1] = const0_rtx;
4433 output_asm_insn ("str\txzr, [%0, %1]", xops);
4434
4435 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4436 xops[0] = adjustment;
4437 xops[1] = probe_offset_value_rtx;
4438 output_asm_insn ("sub\t%0, %0, %1", xops);
4439
4440 /* Branch to start if still more bytes to allocate. */
4441 fputs ("\tb\t", asm_out_file);
4442 assemble_name_raw (asm_out_file, loop_start_lab);
4443 fputc ('\n', asm_out_file);
4444
4445 /* No more probing is needed; emit the loop end label. */
4446 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4447
4448 /* BASE = BASE - ADJUSTMENT. */
4449 xops[0] = base;
4450 xops[1] = adjustment;
4451 output_asm_insn ("sub\t%0, %0, %1", xops);
4452 return "";
4453 }
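/* Sketch of the code emitted above (illustrative register numbers, and
   assuming the clamped RESIDUAL_PROBE_GUARD is 4096):

   .SVLPSPL0:
	cmp	x11, 4096		// enough left to need a probe?
	b.lt	.SVLPEND0
	sub	x10, x10, 4096		// BASE -= RESIDUAL_PROBE_GUARD
	str	xzr, [x10, 0]		// probe at the new BASE
	sub	x11, x11, 4096		// ADJUSTMENT -= RESIDUAL_PROBE_GUARD
	b	.SVLPSPL0
   .SVLPEND0:
	sub	x10, x10, x11		// BASE -= remaining ADJUSTMENT  */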
4454
4455 /* Determine whether a frame chain needs to be generated. */
4456 static bool
4457 aarch64_needs_frame_chain (void)
4458 {
4459 /* Force a frame chain for EH returns so the return address is at FP+8. */
4460 if (frame_pointer_needed || crtl->calls_eh_return)
4461 return true;
4462
4463 /* A leaf function cannot have calls or write LR. */
4464 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4465
4466 /* Don't use a frame chain in leaf functions if leaf frame pointers
4467 are disabled. */
4468 if (flag_omit_leaf_frame_pointer && is_leaf)
4469 return false;
4470
4471 return aarch64_use_frame_pointer;
4472 }
4473
4474 /* Mark the registers that need to be saved by the callee and calculate
4475 the size of the callee-saved registers area and frame record (both FP
4476 and LR may be omitted). */
4477 static void
4478 aarch64_layout_frame (void)
4479 {
4480 HOST_WIDE_INT offset = 0;
4481 int regno, last_fp_reg = INVALID_REGNUM;
4482 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4483
4484 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4485
4486 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4487 the mid-end is doing. */
4488 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4489
4490 #define SLOT_NOT_REQUIRED (-2)
4491 #define SLOT_REQUIRED (-1)
4492
4493 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4494 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4495
4496 /* If this is a non-leaf simd function with calls, we assume that
4497 at least one of those calls is to a non-simd function and thus
4498 we must save V8 to V23 in the prologue. */
4499
4500 if (simd_function && !crtl->is_leaf)
4501 {
4502 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4503 if (FP_SIMD_SAVED_REGNUM_P (regno))
4504 df_set_regs_ever_live (regno, true);
4505 }
4506
4507 /* First mark all the registers that really need to be saved... */
4508 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4509 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4510
4511 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4512 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4513
4514 /* ... that includes the eh data registers (if needed)... */
4515 if (crtl->calls_eh_return)
4516 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4517 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4518 = SLOT_REQUIRED;
4519
4520 /* ... and any callee saved register that dataflow says is live. */
4521 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4522 if (df_regs_ever_live_p (regno)
4523 && (regno == R30_REGNUM
4524 || !call_used_regs[regno]))
4525 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4526
4527 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4528 if (df_regs_ever_live_p (regno)
4529 && (!call_used_regs[regno]
4530 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4531 {
4532 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4533 last_fp_reg = regno;
4534 }
4535
4536 if (cfun->machine->frame.emit_frame_chain)
4537 {
4538 /* FP and LR are placed in the linkage record. */
4539 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4540 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4541 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4542 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4543 offset = 2 * UNITS_PER_WORD;
4544 }
4545
4546 /* With stack-clash, LR must be saved in non-leaf functions. */
4547 gcc_assert (crtl->is_leaf
4548 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4549 != SLOT_NOT_REQUIRED));
4550
4551 /* Now assign stack slots for them. */
4552 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4553 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4554 {
4555 cfun->machine->frame.reg_offset[regno] = offset;
4556 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4557 cfun->machine->frame.wb_candidate1 = regno;
4558 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4559 cfun->machine->frame.wb_candidate2 = regno;
4560 offset += UNITS_PER_WORD;
4561 }
4562
4563 HOST_WIDE_INT max_int_offset = offset;
4564 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4565 bool has_align_gap = offset != max_int_offset;
4566
4567 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4568 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4569 {
4570 /* If there is an alignment gap between integer and fp callee-saves,
4571 allocate the last fp register to it if possible. */
4572 if (regno == last_fp_reg
4573 && has_align_gap
4574 && !simd_function
4575 && (offset & 8) == 0)
4576 {
4577 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4578 break;
4579 }
4580
4581 cfun->machine->frame.reg_offset[regno] = offset;
4582 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4583 cfun->machine->frame.wb_candidate1 = regno;
4584 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4585 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4586 cfun->machine->frame.wb_candidate2 = regno;
4587 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4588 }
4589
4590 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4591
4592 cfun->machine->frame.saved_regs_size = offset;
4593
4594 HOST_WIDE_INT varargs_and_saved_regs_size
4595 = offset + cfun->machine->frame.saved_varargs_size;
4596
4597 cfun->machine->frame.hard_fp_offset
4598 = aligned_upper_bound (varargs_and_saved_regs_size
4599 + get_frame_size (),
4600 STACK_BOUNDARY / BITS_PER_UNIT);
4601
4602 /* Both these values are already aligned. */
4603 gcc_assert (multiple_p (crtl->outgoing_args_size,
4604 STACK_BOUNDARY / BITS_PER_UNIT));
4605 cfun->machine->frame.frame_size
4606 = (cfun->machine->frame.hard_fp_offset
4607 + crtl->outgoing_args_size);
4608
4609 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4610
4611 cfun->machine->frame.initial_adjust = 0;
4612 cfun->machine->frame.final_adjust = 0;
4613 cfun->machine->frame.callee_adjust = 0;
4614 cfun->machine->frame.callee_offset = 0;
4615
4616 HOST_WIDE_INT max_push_offset = 0;
4617 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4618 max_push_offset = 512;
4619 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4620 max_push_offset = 256;
4621
4622 HOST_WIDE_INT const_size, const_fp_offset;
4623 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4624 && const_size < max_push_offset
4625 && known_eq (crtl->outgoing_args_size, 0))
4626 {
4627 /* Simple, small frame with no outgoing arguments:
4628 stp reg1, reg2, [sp, -frame_size]!
4629 stp reg3, reg4, [sp, 16] */
4630 cfun->machine->frame.callee_adjust = const_size;
4631 }
4632 else if (known_lt (crtl->outgoing_args_size
4633 + cfun->machine->frame.saved_regs_size, 512)
4634 && !(cfun->calls_alloca
4635 && known_lt (cfun->machine->frame.hard_fp_offset,
4636 max_push_offset)))
4637 {
4638 /* Frame with small outgoing arguments:
4639 sub sp, sp, frame_size
4640 stp reg1, reg2, [sp, outgoing_args_size]
4641 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4642 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4643 cfun->machine->frame.callee_offset
4644 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4645 }
4646 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4647 && const_fp_offset < max_push_offset)
4648 {
4649 /* Frame with large outgoing arguments but a small local area:
4650 stp reg1, reg2, [sp, -hard_fp_offset]!
4651 stp reg3, reg4, [sp, 16]
4652 sub sp, sp, outgoing_args_size */
4653 cfun->machine->frame.callee_adjust = const_fp_offset;
4654 cfun->machine->frame.final_adjust
4655 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4656 }
4657 else
4658 {
4659 /* Frame with large local area and outgoing arguments using frame pointer:
4660 sub sp, sp, hard_fp_offset
4661 stp x29, x30, [sp, 0]
4662 add x29, sp, 0
4663 stp reg3, reg4, [sp, 16]
4664 sub sp, sp, outgoing_args_size */
4665 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4666 cfun->machine->frame.final_adjust
4667 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4668 }
4669
4670 cfun->machine->frame.laid_out = true;
4671 }
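/* A rough worked example of the layout above (not derived from any specific
   test case): a non-SVE function that needs a frame chain, also saves x19,
   has 32 bytes of locals and no outgoing arguments gets reg_offset values of
   0, 8 and 16 for x29, x30 and x19, saved_regs_size = 32 after rounding,
   hard_fp_offset = 64 and frame_size = 64.  Since 64 < 512 and there are no
   outgoing arguments, the first case applies and callee_adjust = 64, i.e.
   the prologue can start with stp x29, x30, [sp, -64]!.  */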
4672
4673 /* Return true if the register REGNO is saved on entry to
4674 the current function. */
4675
4676 static bool
4677 aarch64_register_saved_on_entry (int regno)
4678 {
4679 return cfun->machine->frame.reg_offset[regno] >= 0;
4680 }
4681
4682 /* Return the next register at or above REGNO, up to LIMIT, that the
4683 callee needs to save. */
4684
4685 static unsigned
4686 aarch64_next_callee_save (unsigned regno, unsigned limit)
4687 {
4688 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4689 regno ++;
4690 return regno;
4691 }
4692
4693 /* Push the register number REGNO of mode MODE to the stack with write-back
4694 adjusting the stack by ADJUSTMENT. */
4695
4696 static void
4697 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4698 HOST_WIDE_INT adjustment)
4699 {
4700 rtx base_rtx = stack_pointer_rtx;
4701 rtx insn, reg, mem;
4702
4703 reg = gen_rtx_REG (mode, regno);
4704 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4705 plus_constant (Pmode, base_rtx, -adjustment));
4706 mem = gen_frame_mem (mode, mem);
4707
4708 insn = emit_move_insn (mem, reg);
4709 RTX_FRAME_RELATED_P (insn) = 1;
4710 }
4711
4712 /* Generate and return an instruction to store the pair of registers
4713 REG and REG2 of mode MODE to location BASE with write-back adjusting
4714 the stack location BASE by ADJUSTMENT. */
4715
4716 static rtx
4717 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4718 HOST_WIDE_INT adjustment)
4719 {
4720 switch (mode)
4721 {
4722 case E_DImode:
4723 return gen_storewb_pairdi_di (base, base, reg, reg2,
4724 GEN_INT (-adjustment),
4725 GEN_INT (UNITS_PER_WORD - adjustment));
4726 case E_DFmode:
4727 return gen_storewb_pairdf_di (base, base, reg, reg2,
4728 GEN_INT (-adjustment),
4729 GEN_INT (UNITS_PER_WORD - adjustment));
4730 case E_TFmode:
4731 return gen_storewb_pairtf_di (base, base, reg, reg2,
4732 GEN_INT (-adjustment),
4733 GEN_INT (UNITS_PER_VREG - adjustment));
4734 default:
4735 gcc_unreachable ();
4736 }
4737 }
4738
4739 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4740 stack pointer by ADJUSTMENT. */
4741
4742 static void
4743 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4744 {
4745 rtx_insn *insn;
4746 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4747
4748 if (regno2 == INVALID_REGNUM)
4749 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4750
4751 rtx reg1 = gen_rtx_REG (mode, regno1);
4752 rtx reg2 = gen_rtx_REG (mode, regno2);
4753
4754 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4755 reg2, adjustment));
4756 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4757 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4758 RTX_FRAME_RELATED_P (insn) = 1;
4759 }
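/* For instance, pushing the frame record with REGNO1 = x29, REGNO2 = x30 and
   ADJUSTMENT = 16 emits the familiar "stp x29, x30, [sp, -16]!", with the
   insn and both register stores marked frame-related (a sketch; the exact
   insn comes from the storewb_pair patterns above).  */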
4760
4761 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4762 adjusting it by ADJUSTMENT afterwards. */
4763
4764 static rtx
4765 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4766 HOST_WIDE_INT adjustment)
4767 {
4768 switch (mode)
4769 {
4770 case E_DImode:
4771 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4772 GEN_INT (UNITS_PER_WORD));
4773 case E_DFmode:
4774 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4775 GEN_INT (UNITS_PER_WORD));
4776 case E_TFmode:
4777 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4778 GEN_INT (UNITS_PER_VREG));
4779 default:
4780 gcc_unreachable ();
4781 }
4782 }
4783
4784 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4785 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4786 into CFI_OPS. */
4787
4788 static void
4789 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4790 rtx *cfi_ops)
4791 {
4792 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4793 rtx reg1 = gen_rtx_REG (mode, regno1);
4794
4795 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4796
4797 if (regno2 == INVALID_REGNUM)
4798 {
4799 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4800 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4801 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4802 }
4803 else
4804 {
4805 rtx reg2 = gen_rtx_REG (mode, regno2);
4806 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4807 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4808 reg2, adjustment));
4809 }
4810 }
4811
4812 /* Generate and return a store pair instruction of mode MODE to store
4813 register REG1 to MEM1 and register REG2 to MEM2. */
4814
4815 static rtx
4816 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4817 rtx reg2)
4818 {
4819 switch (mode)
4820 {
4821 case E_DImode:
4822 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4823
4824 case E_DFmode:
4825 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4826
4827 case E_TFmode:
4828 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4829
4830 default:
4831 gcc_unreachable ();
4832 }
4833 }
4834
4835 /* Generate and return a load pair instruction of mode MODE to load register
4836 REG1 from MEM1 and register REG2 from MEM2. */
4837
4838 static rtx
4839 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4840 rtx mem2)
4841 {
4842 switch (mode)
4843 {
4844 case E_DImode:
4845 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4846
4847 case E_DFmode:
4848 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4849
4850 case E_TFmode:
4851 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4852
4853 default:
4854 gcc_unreachable ();
4855 }
4856 }
4857
4858 /* Return TRUE if return address signing should be enabled for the current
4859 function, otherwise return FALSE. */
4860
4861 bool
4862 aarch64_return_address_signing_enabled (void)
4863 {
4864 /* This function should only be called after the frame is laid out. */
4865 gcc_assert (cfun->machine->frame.laid_out);
4866
4867 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4868 function if its LR is pushed onto the stack. */
4869 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4870 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4871 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4872 }
4873
4874 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4875 bool
4876 aarch64_bti_enabled (void)
4877 {
4878 return (aarch64_enable_bti == 1);
4879 }
4880
4881 /* Emit code to save the callee-saved registers from register number START
4882 to LIMIT to the stack at the location starting at offset START_OFFSET,
4883 skipping any write-back candidates if SKIP_WB is true. */
4884
4885 static void
4886 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4887 unsigned start, unsigned limit, bool skip_wb)
4888 {
4889 rtx_insn *insn;
4890 unsigned regno;
4891 unsigned regno2;
4892
4893 for (regno = aarch64_next_callee_save (start, limit);
4894 regno <= limit;
4895 regno = aarch64_next_callee_save (regno + 1, limit))
4896 {
4897 rtx reg, mem;
4898 poly_int64 offset;
4899 int offset_diff;
4900
4901 if (skip_wb
4902 && (regno == cfun->machine->frame.wb_candidate1
4903 || regno == cfun->machine->frame.wb_candidate2))
4904 continue;
4905
4906 if (cfun->machine->reg_is_wrapped_separately[regno])
4907 continue;
4908
4909 reg = gen_rtx_REG (mode, regno);
4910 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4911 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4912 offset));
4913
4914 regno2 = aarch64_next_callee_save (regno + 1, limit);
4915 offset_diff = cfun->machine->frame.reg_offset[regno2]
4916 - cfun->machine->frame.reg_offset[regno];
4917
4918 if (regno2 <= limit
4919 && !cfun->machine->reg_is_wrapped_separately[regno2]
4920 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4921 {
4922 rtx reg2 = gen_rtx_REG (mode, regno2);
4923 rtx mem2;
4924
4925 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4926 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4927 offset));
4928 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4929 reg2));
4930
4931 /* The first part of a frame-related parallel insn is
4932 always assumed to be relevant to the frame
4933 calculations; subsequent parts are only
4934 frame-related if explicitly marked. */
4935 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4936 regno = regno2;
4937 }
4938 else
4939 insn = emit_move_insn (mem, reg);
4940
4941 RTX_FRAME_RELATED_P (insn) = 1;
4942 }
4943 }
4944
4945 /* Emit code to restore the callee registers of mode MODE from register
4946 number START up to and including LIMIT. Restore from the stack offset
4947 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4948 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4949
4950 static void
4951 aarch64_restore_callee_saves (machine_mode mode,
4952 poly_int64 start_offset, unsigned start,
4953 unsigned limit, bool skip_wb, rtx *cfi_ops)
4954 {
4955 rtx base_rtx = stack_pointer_rtx;
4956 unsigned regno;
4957 unsigned regno2;
4958 poly_int64 offset;
4959
4960 for (regno = aarch64_next_callee_save (start, limit);
4961 regno <= limit;
4962 regno = aarch64_next_callee_save (regno + 1, limit))
4963 {
4964 if (cfun->machine->reg_is_wrapped_separately[regno])
4965 continue;
4966
4967 rtx reg, mem;
4968 int offset_diff;
4969
4970 if (skip_wb
4971 && (regno == cfun->machine->frame.wb_candidate1
4972 || regno == cfun->machine->frame.wb_candidate2))
4973 continue;
4974
4975 reg = gen_rtx_REG (mode, regno);
4976 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4977 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4978
4979 regno2 = aarch64_next_callee_save (regno + 1, limit);
4980 offset_diff = cfun->machine->frame.reg_offset[regno2]
4981 - cfun->machine->frame.reg_offset[regno];
4982
4983 if (regno2 <= limit
4984 && !cfun->machine->reg_is_wrapped_separately[regno2]
4985 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4986 {
4987 rtx reg2 = gen_rtx_REG (mode, regno2);
4988 rtx mem2;
4989
4990 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4991 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4992 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4993
4994 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4995 regno = regno2;
4996 }
4997 else
4998 emit_move_insn (reg, mem);
4999 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5000 }
5001 }
5002
5003 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5004 of MODE. */
5005
5006 static inline bool
5007 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5008 {
5009 HOST_WIDE_INT multiple;
5010 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5011 && IN_RANGE (multiple, -8, 7));
5012 }
5013
5014 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5015 of MODE. */
5016
5017 static inline bool
5018 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5019 {
5020 HOST_WIDE_INT multiple;
5021 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5022 && IN_RANGE (multiple, 0, 63));
5023 }
5024
5025 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5026 of MODE. */
5027
5028 bool
5029 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5030 {
5031 HOST_WIDE_INT multiple;
5032 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5033 && IN_RANGE (multiple, -64, 63));
5034 }
5035
5036 /* Return true if OFFSET is a signed 9-bit value. */
5037
5038 bool
5039 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5040 poly_int64 offset)
5041 {
5042 HOST_WIDE_INT const_offset;
5043 return (offset.is_constant (&const_offset)
5044 && IN_RANGE (const_offset, -256, 255));
5045 }
5046
5047 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5048 of MODE. */
5049
5050 static inline bool
5051 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5052 {
5053 HOST_WIDE_INT multiple;
5054 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5055 && IN_RANGE (multiple, -256, 255));
5056 }
5057
5058 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5059 of MODE. */
5060
5061 static inline bool
5062 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5063 {
5064 HOST_WIDE_INT multiple;
5065 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5066 && IN_RANGE (multiple, 0, 4095));
5067 }
5068
5069 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5070
5071 static sbitmap
5072 aarch64_get_separate_components (void)
5073 {
5074 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5075 bitmap_clear (components);
5076
5077 /* The registers we need saved to the frame. */
5078 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5079 if (aarch64_register_saved_on_entry (regno))
5080 {
5081 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5082 if (!frame_pointer_needed)
5083 offset += cfun->machine->frame.frame_size
5084 - cfun->machine->frame.hard_fp_offset;
5085 /* Check that we can access the stack slot of the register with one
5086 direct load with no adjustments needed. */
5087 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5088 bitmap_set_bit (components, regno);
5089 }
5090
5091 /* Don't mess with the hard frame pointer. */
5092 if (frame_pointer_needed)
5093 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5094
5095 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5096 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5097 /* If registers have been chosen to be stored/restored with
5098 writeback, don't interfere with them to avoid having to output explicit
5099 stack adjustment instructions. */
5100 if (reg2 != INVALID_REGNUM)
5101 bitmap_clear_bit (components, reg2);
5102 if (reg1 != INVALID_REGNUM)
5103 bitmap_clear_bit (components, reg1);
5104
5105 bitmap_clear_bit (components, LR_REGNUM);
5106 bitmap_clear_bit (components, SP_REGNUM);
5107
5108 return components;
5109 }
5110
5111 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5112
5113 static sbitmap
5114 aarch64_components_for_bb (basic_block bb)
5115 {
5116 bitmap in = DF_LIVE_IN (bb);
5117 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5118 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5119 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5120
5121 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5122 bitmap_clear (components);
5123
5124 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5125 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5126 if ((!call_used_regs[regno]
5127 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5128 && (bitmap_bit_p (in, regno)
5129 || bitmap_bit_p (gen, regno)
5130 || bitmap_bit_p (kill, regno)))
5131 {
5132 unsigned regno2, offset, offset2;
5133 bitmap_set_bit (components, regno);
5134
5135 /* If there is a callee-save at an adjacent offset, add it too
5136 to increase the use of LDP/STP. */
5137 offset = cfun->machine->frame.reg_offset[regno];
5138 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5139
5140 if (regno2 <= LAST_SAVED_REGNUM)
5141 {
5142 offset2 = cfun->machine->frame.reg_offset[regno2];
5143 if ((offset & ~8) == (offset2 & ~8))
5144 bitmap_set_bit (components, regno2);
5145 }
5146 }
5147
5148 return components;
5149 }
5150
5151 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5152 Nothing to do for aarch64. */
5153
5154 static void
5155 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5156 {
5157 }
5158
5159 /* Return the next set bit in BMP from START onwards. Return the total number
5160 of bits in BMP if no set bit is found at or after START. */
5161
5162 static unsigned int
5163 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5164 {
5165 unsigned int nbits = SBITMAP_SIZE (bmp);
5166 if (start == nbits)
5167 return start;
5168
5169 gcc_assert (start < nbits);
5170 for (unsigned int i = start; i < nbits; i++)
5171 if (bitmap_bit_p (bmp, i))
5172 return i;
5173
5174 return nbits;
5175 }
5176
5177 /* Do the work for aarch64_emit_prologue_components and
5178 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5179 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5180 for these components or the epilogue sequence. That is, it determines
5181 whether we should emit stores or loads and what kind of CFA notes to attach
5182 to the insns. Otherwise the logic for the two sequences is very
5183 similar. */
5184
5185 static void
5186 aarch64_process_components (sbitmap components, bool prologue_p)
5187 {
5188 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5189 ? HARD_FRAME_POINTER_REGNUM
5190 : STACK_POINTER_REGNUM);
5191
5192 unsigned last_regno = SBITMAP_SIZE (components);
5193 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5194 rtx_insn *insn = NULL;
5195
5196 while (regno != last_regno)
5197 {
5198 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved,
5199 so DFmode for the vector registers is enough. For simd functions
5200 we want to save the low 128 bits. */
5201 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5202
5203 rtx reg = gen_rtx_REG (mode, regno);
5204 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5205 if (!frame_pointer_needed)
5206 offset += cfun->machine->frame.frame_size
5207 - cfun->machine->frame.hard_fp_offset;
5208 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5209 rtx mem = gen_frame_mem (mode, addr);
5210
5211 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5212 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5213 /* No more registers to handle after REGNO.
5214 Emit a single save/restore and exit. */
5215 if (regno2 == last_regno)
5216 {
5217 insn = emit_insn (set);
5218 RTX_FRAME_RELATED_P (insn) = 1;
5219 if (prologue_p)
5220 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5221 else
5222 add_reg_note (insn, REG_CFA_RESTORE, reg);
5223 break;
5224 }
5225
5226 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5227 /* The next register is not of the same class or its offset is not
5228 mergeable with the current one into a pair. */
5229 if (!satisfies_constraint_Ump (mem)
5230 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5231 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5232 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5233 GET_MODE_SIZE (mode)))
5234 {
5235 insn = emit_insn (set);
5236 RTX_FRAME_RELATED_P (insn) = 1;
5237 if (prologue_p)
5238 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5239 else
5240 add_reg_note (insn, REG_CFA_RESTORE, reg);
5241
5242 regno = regno2;
5243 continue;
5244 }
5245
5246 /* REGNO2 can be saved/restored in a pair with REGNO. */
5247 rtx reg2 = gen_rtx_REG (mode, regno2);
5248 if (!frame_pointer_needed)
5249 offset2 += cfun->machine->frame.frame_size
5250 - cfun->machine->frame.hard_fp_offset;
5251 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5252 rtx mem2 = gen_frame_mem (mode, addr2);
5253 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5254 : gen_rtx_SET (reg2, mem2);
5255
5256 if (prologue_p)
5257 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5258 else
5259 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5260
5261 RTX_FRAME_RELATED_P (insn) = 1;
5262 if (prologue_p)
5263 {
5264 add_reg_note (insn, REG_CFA_OFFSET, set);
5265 add_reg_note (insn, REG_CFA_OFFSET, set2);
5266 }
5267 else
5268 {
5269 add_reg_note (insn, REG_CFA_RESTORE, reg);
5270 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5271 }
5272
5273 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5274 }
5275 }
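
/* As a concrete illustration (the offsets here are made up): if the
   separately shrink-wrapped components are x22 and x23 and their save
   slots are adjacent, the prologue path above collapses them into one
   store pair,

     stp x22, x23, [sp, 32]

   with a REG_CFA_OFFSET note per register, while the epilogue path emits
   the matching

     ldp x22, x23, [sp, 32]

   with REG_CFA_RESTORE notes instead.  */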
5276
5277 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5278
5279 static void
5280 aarch64_emit_prologue_components (sbitmap components)
5281 {
5282 aarch64_process_components (components, true);
5283 }
5284
5285 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5286
5287 static void
5288 aarch64_emit_epilogue_components (sbitmap components)
5289 {
5290 aarch64_process_components (components, false);
5291 }
5292
5293 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5294
5295 static void
5296 aarch64_set_handled_components (sbitmap components)
5297 {
5298 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5299 if (bitmap_bit_p (components, regno))
5300 cfun->machine->reg_is_wrapped_separately[regno] = true;
5301 }
5302
5303 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5304 determine the probe offset for alloca. */
5305
5306 static HOST_WIDE_INT
5307 aarch64_stack_clash_protection_alloca_probe_range (void)
5308 {
5309 return STACK_CLASH_CALLER_GUARD;
5310 }
5311
5312
5313 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5314 registers. If POLY_SIZE is not large enough to require a probe this function
5315 will only adjust the stack. When allocating the stack space
5316 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5317 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5318 arguments. If we are, we ensure that any allocation larger than the ABI
5319 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5320 maintained.
5321
5322 We emit barriers after each stack adjustment to prevent optimizations from
5323 breaking the invariant that we never drop the stack more than a page. This
5324 invariant is needed to make it easier to correctly handle asynchronous
5325 events: if we were to allow the stack to be dropped by more than a page and
5326 then issue multiple probes to catch up, a signal taken somewhere in between
5327 would leave the signal handler not knowing the state of the stack, unable to
5328 make any assumption about which pages have been probed. */
5329
5330 static void
5331 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5332 poly_int64 poly_size,
5333 bool frame_related_p,
5334 bool final_adjustment_p)
5335 {
5336 HOST_WIDE_INT guard_size
5337 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5338 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5339 /* When doing the final adjustment for the outgoing argument size we can't
5340 assume that LR was saved at position 0. So subtract its offset from the
5341 ABI safe buffer so that we don't accidentally allow an adjustment that
5342 would result in an allocation larger than the ABI buffer without
5343 probing. */
5344 HOST_WIDE_INT min_probe_threshold
5345 = final_adjustment_p
5346 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5347 : guard_size - guard_used_by_caller;
5348
5349 poly_int64 frame_size = cfun->machine->frame.frame_size;
5350
5351 /* We should always have a positive probe threshold. */
5352 gcc_assert (min_probe_threshold > 0);
5353
5354 if (flag_stack_clash_protection && !final_adjustment_p)
5355 {
5356 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5357 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5358
5359 if (known_eq (frame_size, 0))
5360 {
5361 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5362 }
5363 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5364 && known_lt (final_adjust, guard_used_by_caller))
5365 {
5366 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5367 }
5368 }
5369
5370 /* If SIZE is not large enough to require probing, just adjust the stack and
5371 exit. */
5372 if (known_lt (poly_size, min_probe_threshold)
5373 || !flag_stack_clash_protection)
5374 {
5375 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5376 return;
5377 }
5378
5379 HOST_WIDE_INT size;
5380 /* Handle the SVE non-constant case first. */
5381 if (!poly_size.is_constant (&size))
5382 {
5383 if (dump_file)
5384 {
5385 fprintf (dump_file, "Stack clash SVE prologue: ");
5386 print_dec (poly_size, dump_file);
5387 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5388 }
5389
5390 /* First calculate the amount of bytes we're actually spilling. */
5391 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5392 poly_size, temp1, temp2, false, true);
5393
5394 rtx_insn *insn = get_last_insn ();
5395
5396 if (frame_related_p)
5397 {
5398 /* This is done to provide unwinding information for the stack
5399 adjustments we're about to do; however, to prevent the optimizers
5400 from removing the R11 move and leaving the CFA note (which would be
5401 very wrong) we tie the old and new stack pointer together.
5402 The tie will expand to nothing but the optimizers will not touch
5403 the instruction. */
5404 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5405 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5406 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5407
5408 /* We want the CFA independent of the stack pointer for the
5409 duration of the loop. */
5410 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5411 RTX_FRAME_RELATED_P (insn) = 1;
5412 }
5413
5414 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5415 rtx guard_const = gen_int_mode (guard_size, Pmode);
5416
5417 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5418 stack_pointer_rtx, temp1,
5419 probe_const, guard_const));
5420
5421 /* Now reset the CFA register if needed. */
5422 if (frame_related_p)
5423 {
5424 add_reg_note (insn, REG_CFA_DEF_CFA,
5425 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5426 gen_int_mode (poly_size, Pmode)));
5427 RTX_FRAME_RELATED_P (insn) = 1;
5428 }
5429
5430 return;
5431 }
5432
5433 if (dump_file)
5434 fprintf (dump_file,
5435 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5436 " bytes, probing will be required.\n", size);
5437
5438 /* Round size to the nearest multiple of guard_size, and calculate the
5439 residual as the difference between the original size and the rounded
5440 size. */
5441 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5442 HOST_WIDE_INT residual = size - rounded_size;
5443
5444 /* We can handle a small number of allocations/probes inline. Otherwise
5445 punt to a loop. */
5446 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5447 {
5448 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5449 {
5450 aarch64_sub_sp (NULL, temp2, guard_size, true);
5451 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5452 guard_used_by_caller));
5453 emit_insn (gen_blockage ());
5454 }
5455 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5456 }
5457 else
5458 {
5459 /* Compute the ending address. */
5460 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5461 temp1, NULL, false, true);
5462 rtx_insn *insn = get_last_insn ();
5463
5464 /* For the initial allocation, we don't have a frame pointer
5465 set up, so we always need CFI notes. If we're doing the
5466 final allocation, then we may have a frame pointer, in which
5467 case it is the CFA, otherwise we need CFI notes.
5468
5469 We can determine which allocation we are doing by looking at
5470 the value of FRAME_RELATED_P since the final allocations are not
5471 frame related. */
5472 if (frame_related_p)
5473 {
5474 /* We want the CFA independent of the stack pointer for the
5475 duration of the loop. */
5476 add_reg_note (insn, REG_CFA_DEF_CFA,
5477 plus_constant (Pmode, temp1, rounded_size));
5478 RTX_FRAME_RELATED_P (insn) = 1;
5479 }
5480
5481 /* This allocates and probes the stack. Note that this re-uses some of
5482 the existing Ada stack protection code. However we are guaranteed not
5483 to enter the non-loop or residual branches of that code.
5484
5485 The non-loop part won't be entered because if our allocation amount
5486 doesn't require a loop, the case above would handle it.
5487
5488 The residual amount won't be entered because TEMP1 is a multiple of
5489 the allocation size. The residual will always be 0. As such, the only
5490 part we are actually using from that code is the loop setup. The
5491 actual probing is done in aarch64_output_probe_stack_range. */
5492 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5493 stack_pointer_rtx, temp1));
5494
5495 /* Now reset the CFA register if needed. */
5496 if (frame_related_p)
5497 {
5498 add_reg_note (insn, REG_CFA_DEF_CFA,
5499 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5500 RTX_FRAME_RELATED_P (insn) = 1;
5501 }
5502
5503 emit_insn (gen_blockage ());
5504 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5505 }
5506
5507 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5508 be probed. This maintains the requirement that each page is probed at
5509 least once. For initial probing we probe only if the allocation is
5510 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5511 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5512 GUARD_SIZE. This ensures that for any allocation large enough to trigger
5513 a probe here we'll have at least one, and that if an allocation is not large
5514 enough for this code to emit anything for it, the page would have been
5515 probed by the saving of FP/LR either by this function or any callees. If
5516 we don't have any callees then we won't have more stack adjustments and so
5517 are still safe. */
5518 if (residual)
5519 {
5520 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5521 /* If we're doing final adjustments, and we've done any full page
5522 allocations then any residual needs to be probed. */
5523 if (final_adjustment_p && rounded_size != 0)
5524 min_probe_threshold = 0;
5525 /* If doing a small final adjustment, we always probe at offset 0.
5526 This is done to avoid issues when LR is not at position 0 or when
5527 the final adjustment is smaller than the probing offset. */
5528 else if (final_adjustment_p && rounded_size == 0)
5529 residual_probe_offset = 0;
5530
5531 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5532 if (residual >= min_probe_threshold)
5533 {
5534 if (dump_file)
5535 fprintf (dump_file,
5536 "Stack clash AArch64 prologue residuals: "
5537 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5538 "\n", residual);
5539
5540 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5541 residual_probe_offset));
5542 emit_insn (gen_blockage ());
5543 }
5544 }
5545 }
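
/* A worked example (a sketch; the numbers are purely illustrative): with
   the default 64KB guard and the 1KB caller-reserved buffer, an initial
   adjustment of 133072 bytes has MIN_PROBE_THRESHOLD == 64512, so probing
   is needed; ROUNDED_SIZE == 131072 and RESIDUAL == 2000. Assuming two
   pages fall within STACK_CLASH_MAX_UNROLL_PAGES, the inline path emits
   roughly:

     sub sp, sp, 65536
     str xzr, [sp, 1024]    // probe within the caller-reserved buffer
     sub sp, sp, 65536
     str xzr, [sp, 1024]
     sub sp, sp, 2000       // residual; 2000 < 64512, so no extra probe

   with a scheduling barrier after each adjustment, as described above.  */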
5546
5547 /* Return 1 if the register is used by the epilogue. We need to say the
5548 return register is used, but only after epilogue generation is complete.
5549 Note that in the case of sibcalls, the values "used by the epilogue" are
5550 considered live at the start of the called function.
5551
5552 For SIMD functions we need to return 1 for FP registers that are saved and
5553 restored by a function but are not zero in call_used_regs. If we do not do
5554 this, optimizations may remove the restore of the register. */
5555
5556 int
5557 aarch64_epilogue_uses (int regno)
5558 {
5559 if (epilogue_completed)
5560 {
5561 if (regno == LR_REGNUM)
5562 return 1;
5563 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5564 return 1;
5565 }
5566 return 0;
5567 }
5568
5569 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5570 is saved at BASE + OFFSET. */
5571
5572 static void
5573 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5574 rtx base, poly_int64 offset)
5575 {
5576 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5577 add_reg_note (insn, REG_CFA_EXPRESSION,
5578 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5579 }
5580
5581 /* AArch64 stack frames generated by this compiler look like:
5582
5583 +-------------------------------+
5584 | |
5585 | incoming stack arguments |
5586 | |
5587 +-------------------------------+
5588 | | <-- incoming stack pointer (aligned)
5589 | callee-allocated save area |
5590 | for register varargs |
5591 | |
5592 +-------------------------------+
5593 | local variables | <-- frame_pointer_rtx
5594 | |
5595 +-------------------------------+
5596 | padding | \
5597 +-------------------------------+ |
5598 | callee-saved registers | | frame.saved_regs_size
5599 +-------------------------------+ |
5600 | LR' | |
5601 +-------------------------------+ |
5602 | FP' | / <- hard_frame_pointer_rtx (aligned)
5603 +-------------------------------+
5604 | dynamic allocation |
5605 +-------------------------------+
5606 | padding |
5607 +-------------------------------+
5608 | outgoing stack arguments | <-- arg_pointer
5609 | |
5610 +-------------------------------+
5611 | | <-- stack_pointer_rtx (aligned)
5612
5613 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5614 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5615 unchanged.
5616
5617 By default for stack-clash we assume the guard is at least 64KB, but this
5618 value is configurable to either 4KB or 64KB. We also force the guard size to
5619 be the same as the probing interval and both values are kept in sync.
5620
5621 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5622 on the guard size) of stack space without probing.
5623
5624 When probing is needed, we emit a probe at the start of the prologue
5625 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5626
5627 We have to track how much space has been allocated and the only stores
5628 to the stack we track as implicit probes are the FP/LR stores.
5629
5630 For outgoing arguments we probe if the size is larger than 1KB, such that
5631 the ABI specified buffer is maintained for the next callee.
5632
5633 The following registers are reserved during frame layout and should not be
5634 used for any other purpose:
5635
5636 - r11: Used by stack clash protection when SVE is enabled.
5637 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5638 - r14 and r15: Used for speculation tracking.
5639 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5640 - r30(LR), r29(FP): Used by standard frame layout.
5641
5642 These registers must be avoided in frame layout related code unless the
5643 explicit intention is to interact with one of the features listed above. */
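
/* As a concrete illustration (a sketch only; the real layout depends on the
   options in effect and on what the function uses), a function with 16
   bytes of locals that saves x19 and needs a frame record could end up
   with:

     +-------------------------------+ <-- incoming stack pointer
     | locals (16 bytes + padding) |
     +-------------------------------+
     | x19 |
     +-------------------------------+
     | LR' (x30) |
     +-------------------------------+
     | FP' (x29) | <-- hard_frame_pointer_rtx
     +-------------------------------+ <-- outgoing stack pointer

   with no varargs save area, no dynamic allocation and no outgoing stack
   arguments.  */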
5644
5645 /* Generate the prologue instructions for entry into a function.
5646 Establish the stack frame by decreasing the stack pointer with a
5647 properly calculated size and, if necessary, create a frame record
5648 filled with the values of LR and previous frame pointer. The
5649 current FP is also set up if it is in use. */
5650
5651 void
5652 aarch64_expand_prologue (void)
5653 {
5654 poly_int64 frame_size = cfun->machine->frame.frame_size;
5655 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5656 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5657 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5658 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5659 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5660 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5661 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5662 rtx_insn *insn;
5663
5664 /* Sign return address for functions. */
5665 if (aarch64_return_address_signing_enabled ())
5666 {
5667 switch (aarch64_ra_sign_key)
5668 {
5669 case AARCH64_KEY_A:
5670 insn = emit_insn (gen_paciasp ());
5671 break;
5672 case AARCH64_KEY_B:
5673 insn = emit_insn (gen_pacibsp ());
5674 break;
5675 default:
5676 gcc_unreachable ();
5677 }
5678 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5679 RTX_FRAME_RELATED_P (insn) = 1;
5680 }
5681
5682 if (flag_stack_usage_info)
5683 current_function_static_stack_size = constant_lower_bound (frame_size);
5684
5685 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5686 {
5687 if (crtl->is_leaf && !cfun->calls_alloca)
5688 {
5689 if (maybe_gt (frame_size, PROBE_INTERVAL)
5690 && maybe_gt (frame_size, get_stack_check_protect ()))
5691 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5692 (frame_size
5693 - get_stack_check_protect ()));
5694 }
5695 else if (maybe_gt (frame_size, 0))
5696 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5697 }
5698
5699 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5700 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5701
5702 /* In theory we should never have both an initial adjustment
5703 and a callee save adjustment. Verify that is the case since the
5704 code below does not handle it for -fstack-clash-protection. */
5705 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5706
5707 /* Will only probe if the initial adjustment is larger than the guard
5708 less the amount of the guard reserved for use by the caller's
5709 outgoing args. */
5710 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5711 true, false);
5712
5713 if (callee_adjust != 0)
5714 aarch64_push_regs (reg1, reg2, callee_adjust);
5715
5716 if (emit_frame_chain)
5717 {
5718 poly_int64 reg_offset = callee_adjust;
5719 if (callee_adjust == 0)
5720 {
5721 reg1 = R29_REGNUM;
5722 reg2 = R30_REGNUM;
5723 reg_offset = callee_offset;
5724 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5725 }
5726 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5727 stack_pointer_rtx, callee_offset,
5728 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5729 if (frame_pointer_needed && !frame_size.is_constant ())
5730 {
5731 /* Variable-sized frames need to describe the save slot
5732 address using DW_CFA_expression rather than DW_CFA_offset.
5733 This means that, without taking further action, the
5734 locations of the registers that we've already saved would
5735 remain based on the stack pointer even after we redefine
5736 the CFA based on the frame pointer. We therefore need new
5737 DW_CFA_expressions to re-express the save slots with addresses
5738 based on the frame pointer. */
5739 rtx_insn *insn = get_last_insn ();
5740 gcc_assert (RTX_FRAME_RELATED_P (insn));
5741
5742 /* Add an explicit CFA definition if this was previously
5743 implicit. */
5744 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5745 {
5746 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5747 callee_offset);
5748 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5749 gen_rtx_SET (hard_frame_pointer_rtx, src));
5750 }
5751
5752 /* Change the save slot expressions for the registers that
5753 we've already saved. */
5754 reg_offset -= callee_offset;
5755 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5756 reg_offset + UNITS_PER_WORD);
5757 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5758 reg_offset);
5759 }
5760 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5761 }
5762
5763 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5764 callee_adjust != 0 || emit_frame_chain);
5765 if (aarch64_simd_decl_p (cfun->decl))
5766 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5767 callee_adjust != 0 || emit_frame_chain);
5768 else
5769 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5770 callee_adjust != 0 || emit_frame_chain);
5771
5772 /* We may need to probe the final adjustment if it is larger than the guard
5773 that is assumed by the callee. */
5774 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5775 !frame_pointer_needed, true);
5776 }
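
/* For a simple non-leaf function that takes the store-pair-with-writeback
   path (a sketch; the exact sequence depends on the frame layout computed
   elsewhere), the expansion amounts to something like:

     stp x29, x30, [sp, -16]!   // callee_adjust: push FP/LR, allocate 16 bytes
     mov x29, sp                // emit_frame_chain: establish the frame record
     sub sp, sp, 16             // final_adjust: outgoing argument area

   with the corresponding CFA notes attached as the code above describes.  */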
5777
5778 /* Return TRUE if we can use a simple_return insn.
5779
5780 This function checks whether the callee saved stack is empty, which
5781 means no restore actions are needed. The pro_and_epilogue pass will use
5782 this to check whether the shrink-wrapping optimization is feasible. */
5783
5784 bool
5785 aarch64_use_return_insn_p (void)
5786 {
5787 if (!reload_completed)
5788 return false;
5789
5790 if (crtl->profile)
5791 return false;
5792
5793 return known_eq (cfun->machine->frame.frame_size, 0);
5794 }
5795
5796 /* Return false for non-leaf SIMD functions in order to avoid
5797 shrink-wrapping them. Doing this will lose the necessary
5798 save/restore of FP registers. */
5799
5800 bool
5801 aarch64_use_simple_return_insn_p (void)
5802 {
5803 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5804 return false;
5805
5806 return true;
5807 }
5808
5809 /* Generate the epilogue instructions for returning from a function.
5810 This is almost exactly the reverse of the prologue sequence, except
5811 that we need to insert barriers to avoid scheduling loads that read
5812 from a deallocated stack, and we optimize the unwind records by
5813 emitting them all together if possible. */
5814 void
5815 aarch64_expand_epilogue (bool for_sibcall)
5816 {
5817 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5818 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5819 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5820 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5821 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5822 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5823 rtx cfi_ops = NULL;
5824 rtx_insn *insn;
5825 /* A stack clash protection prologue may not have left EP0_REGNUM or
5826 EP1_REGNUM in a usable state. The same is true for allocations
5827 with an SVE component, since we then need both temporary registers
5828 for each allocation. For stack clash we are in a usable state if
5829 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5830 HOST_WIDE_INT guard_size
5831 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5832 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5833
5834 /* We can re-use the registers when the allocation amount is smaller than
5835 guard_size - guard_used_by_caller because we won't be doing any probes
5836 then. In such situations the register should remain live with the correct
5837 value. */
5838 bool can_inherit_p = (initial_adjust.is_constant ()
5839 && final_adjust.is_constant ())
5840 && (!flag_stack_clash_protection
5841 || known_lt (initial_adjust,
5842 guard_size - guard_used_by_caller));
5843
5844 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5845 bool need_barrier_p
5846 = maybe_ne (get_frame_size ()
5847 + cfun->machine->frame.saved_varargs_size, 0);
5848
5849 /* Emit a barrier to prevent loads from a deallocated stack. */
5850 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5851 || cfun->calls_alloca
5852 || crtl->calls_eh_return)
5853 {
5854 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5855 need_barrier_p = false;
5856 }
5857
5858 /* Restore the stack pointer from the frame pointer if it may not
5859 be the same as the stack pointer. */
5860 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5861 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5862 if (frame_pointer_needed
5863 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5864 /* If writeback is used when restoring callee-saves, the CFA
5865 is restored on the instruction doing the writeback. */
5866 aarch64_add_offset (Pmode, stack_pointer_rtx,
5867 hard_frame_pointer_rtx, -callee_offset,
5868 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5869 else
5870 /* The case where we need to re-use the register here is very rare, so
5871 avoid the complicated condition and just always emit a move if the
5872 immediate doesn't fit. */
5873 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5874
5875 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5876 callee_adjust != 0, &cfi_ops);
5877 if (aarch64_simd_decl_p (cfun->decl))
5878 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5879 callee_adjust != 0, &cfi_ops);
5880 else
5881 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5882 callee_adjust != 0, &cfi_ops);
5883
5884 if (need_barrier_p)
5885 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5886
5887 if (callee_adjust != 0)
5888 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5889
5890 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5891 {
5892 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5893 insn = get_last_insn ();
5894 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5895 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5896 RTX_FRAME_RELATED_P (insn) = 1;
5897 cfi_ops = NULL;
5898 }
5899
5900 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5901 restrict the emit_move optimization to leaf functions. */
5902 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5903 (!can_inherit_p || !crtl->is_leaf
5904 || df_regs_ever_live_p (EP0_REGNUM)));
5905
5906 if (cfi_ops)
5907 {
5908 /* Emit delayed restores and reset the CFA to be SP. */
5909 insn = get_last_insn ();
5910 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5911 REG_NOTES (insn) = cfi_ops;
5912 RTX_FRAME_RELATED_P (insn) = 1;
5913 }
5914
5915 /* We prefer to emit the combined return/authenticate instruction RETAA;
5916 however, there are three cases in which we must instead emit an explicit
5917 authentication instruction.
5918
5919 1) Sibcalls don't return in a normal way, so if we're about to call one
5920 we must authenticate.
5921
5922 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5923 generating code for !TARGET_ARMV8_3 we can't use it and must
5924 explicitly authenticate.
5925
5926 3) On an eh_return path we make extra stack adjustments to update the
5927 canonical frame address to be the exception handler's CFA. We want
5928 to authenticate using the CFA of the function which calls eh_return.
5929 */
5930 if (aarch64_return_address_signing_enabled ()
5931 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5932 {
5933 switch (aarch64_ra_sign_key)
5934 {
5935 case AARCH64_KEY_A:
5936 insn = emit_insn (gen_autiasp ());
5937 break;
5938 case AARCH64_KEY_B:
5939 insn = emit_insn (gen_autibsp ());
5940 break;
5941 default:
5942 gcc_unreachable ();
5943 }
5944 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5945 RTX_FRAME_RELATED_P (insn) = 1;
5946 }
5947
5948 /* Stack adjustment for exception handler. */
5949 if (crtl->calls_eh_return && !for_sibcall)
5950 {
5951 /* We need to unwind the stack by the offset computed by
5952 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5953 to be SP; letting the CFA move during this adjustment
5954 is just as correct as retaining the CFA from the body
5955 of the function. Therefore, do nothing special. */
5956 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5957 }
5958
5959 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5960 if (!for_sibcall)
5961 emit_jump_insn (ret_rtx);
5962 }
5963
5964 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5965 normally or return to a previous frame after unwinding.
5966
5967 An EH return uses a single shared return sequence. The epilogue is
5968 exactly like a normal epilogue except that it has an extra input
5969 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5970 that must be applied after the frame has been destroyed. An extra label
5971 is inserted before the epilogue which initializes this register to zero,
5972 and this is the entry point for a normal return.
5973
5974 An actual EH return updates the return address, initializes the stack
5975 adjustment and jumps directly into the epilogue (bypassing the zeroing
5976 of the adjustment). Since the return address is typically saved on the
5977 stack when a function makes a call, the saved LR must be updated outside
5978 the epilogue.
5979
5980 This poses problems as the store is generated well before the epilogue,
5981 so the offset of LR is not known yet. Also optimizations will remove the
5982 store as it appears dead, even after the epilogue is generated (as the
5983 base or offset for loading LR is different in many cases).
5984
5985 To avoid these problems this implementation forces the frame pointer
5986 in eh_return functions so that the location of LR is fixed and known early.
5987 It also marks the store volatile, so no optimization is permitted to
5988 remove the store. */
5989 rtx
5990 aarch64_eh_return_handler_rtx (void)
5991 {
5992 rtx tmp = gen_frame_mem (Pmode,
5993 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5994
5995 /* Mark the store volatile, so no optimization is permitted to remove it. */
5996 MEM_VOLATILE_P (tmp) = true;
5997 return tmp;
5998 }
5999
6000 /* Output code to add DELTA to the first argument, and then jump
6001 to FUNCTION. Used for C++ multiple inheritance. */
6002 static void
6003 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6004 HOST_WIDE_INT delta,
6005 HOST_WIDE_INT vcall_offset,
6006 tree function)
6007 {
6008 /* The this pointer is always in x0. Note that this differs from
6009 Arm where the this pointer may be bumped to r1 if r0 is required
6010 to return a pointer to an aggregate. On AArch64 a result value
6011 pointer will be in x8. */
6012 int this_regno = R0_REGNUM;
6013 rtx this_rtx, temp0, temp1, addr, funexp;
6014 rtx_insn *insn;
6015 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6016
6017 if (aarch64_bti_enabled ())
6018 emit_insn (gen_bti_c ());
6019
6020 reload_completed = 1;
6021 emit_note (NOTE_INSN_PROLOGUE_END);
6022
6023 this_rtx = gen_rtx_REG (Pmode, this_regno);
6024 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6025 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6026
6027 if (vcall_offset == 0)
6028 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6029 else
6030 {
6031 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6032
6033 addr = this_rtx;
6034 if (delta != 0)
6035 {
6036 if (delta >= -256 && delta < 256)
6037 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6038 plus_constant (Pmode, this_rtx, delta));
6039 else
6040 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6041 temp1, temp0, false);
6042 }
6043
6044 if (Pmode == ptr_mode)
6045 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6046 else
6047 aarch64_emit_move (temp0,
6048 gen_rtx_ZERO_EXTEND (Pmode,
6049 gen_rtx_MEM (ptr_mode, addr)));
6050
6051 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6052 addr = plus_constant (Pmode, temp0, vcall_offset);
6053 else
6054 {
6055 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6056 Pmode);
6057 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6058 }
6059
6060 if (Pmode == ptr_mode)
6061 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6062 else
6063 aarch64_emit_move (temp1,
6064 gen_rtx_SIGN_EXTEND (Pmode,
6065 gen_rtx_MEM (ptr_mode, addr)));
6066
6067 emit_insn (gen_add2_insn (this_rtx, temp1));
6068 }
6069
6070 /* Generate a tail call to the target function. */
6071 if (!TREE_USED (function))
6072 {
6073 assemble_external (function);
6074 TREE_USED (function) = 1;
6075 }
6076 funexp = XEXP (DECL_RTL (function), 0);
6077 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6078 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6079 SIBLING_CALL_P (insn) = 1;
6080
6081 insn = get_insns ();
6082 shorten_branches (insn);
6083
6084 assemble_start_function (thunk, fnname);
6085 final_start_function (insn, file, 1);
6086 final (insn, file, 1);
6087 final_end_function ();
6088 assemble_end_function (thunk, fnname);
6089
6090 /* Stop pretending to be a post-reload pass. */
6091 reload_completed = 0;
6092 }
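
/* For the common delta-only case (a sketch; "target_function" stands in
   for the real method symbol), the emitted thunk is simply:

     add x0, x0, 16        // apply DELTA to the this pointer
     b   target_function   // sibcall to FUNCTION

   while a nonzero VCALL_OFFSET additionally loads the vtable pointer
   through the adjusted this pointer and adds the value found at
   VCALL_OFFSET within it before the branch.  */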
6093
6094 static bool
6095 aarch64_tls_referenced_p (rtx x)
6096 {
6097 if (!TARGET_HAVE_TLS)
6098 return false;
6099 subrtx_iterator::array_type array;
6100 FOR_EACH_SUBRTX (iter, array, x, ALL)
6101 {
6102 const_rtx x = *iter;
6103 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6104 return true;
6105 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6106 TLS offsets, not real symbol references. */
6107 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6108 iter.skip_subrtxes ();
6109 }
6110 return false;
6111 }
6112
6113
6114 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6115 a left shift of 0 or 12 bits. */
6116 bool
6117 aarch64_uimm12_shift (HOST_WIDE_INT val)
6118 {
6119 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6120 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6121 );
6122 }
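
/* For example (illustrative values): 0xabc and 0xabc000 both satisfy this
   test, fitting entirely within bits [0,11] or bits [12,23] respectively,
   whereas 0x1001 does not, because it needs bits from both ranges.  */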
6123
6124 /* Return VAL rounded down to the nearest value that will fit as a 12-bit
6125 unsigned immediate created with a left shift of 0 or 12. */
6126 static HOST_WIDE_INT
6127 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6128 {
6129 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6130 handle correctly. */
6131 gcc_assert ((val & 0xffffff) == val);
6132
6133 if (((val & 0xfff) << 0) == val)
6134 return val;
6135
6136 return val & (0xfff << 12);
6137 }
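
/* For example (illustrative values): 0xabc is returned unchanged, while
   0xabcde is clamped to 0xab000, i.e. the low 12 bits are dropped so that
   the result is encodable as a 12-bit immediate shifted left by 12.  */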
6138
6139 /* Return true if val is an immediate that can be loaded into a
6140 register by a MOVZ instruction. */
6141 static bool
6142 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6143 {
6144 if (GET_MODE_SIZE (mode) > 4)
6145 {
6146 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6147 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6148 return 1;
6149 }
6150 else
6151 {
6152 /* Ignore sign extension. */
6153 val &= (HOST_WIDE_INT) 0xffffffff;
6154 }
6155 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6156 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6157 }
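
/* For example (illustrative values): in DImode, 0x12340000 (0x1234 << 16)
   and 0xabcd000000000000 (0xabcd << 48) are single-MOVZ values, whereas
   0x12340001 spans two 16-bit chunks and is rejected.  */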
6158
6159 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6160 64-bit (DImode) integer. */
6161
6162 static unsigned HOST_WIDE_INT
6163 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6164 {
6165 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6166 while (size < 64)
6167 {
6168 val &= (HOST_WIDE_INT_1U << size) - 1;
6169 val |= val << size;
6170 size *= 2;
6171 }
6172 return val;
6173 }
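
/* A standalone analogue of this replication step (a sketch using plain C
   types rather than GCC's machine modes), handy for experimenting with
   the immediate rules outside the compiler:

     #include <stdint.h>

     static uint64_t
     replicate (uint64_t val, unsigned int width)
     {
       while (width < 64)
         {
           val &= ((uint64_t) 1 << width) - 1;
           val |= val << width;
           width *= 2;
         }
       return val;
     }

   For instance replicate (0xab, 8) yields 0xabababababababab, matching
   what this function computes for a QImode element.  */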
6174
6175 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6176
6177 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6178 {
6179 0x0000000100000001ull,
6180 0x0001000100010001ull,
6181 0x0101010101010101ull,
6182 0x1111111111111111ull,
6183 0x5555555555555555ull,
6184 };
6185
6186
6187 /* Return true if val is a valid bitmask immediate. */
6188
6189 bool
6190 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6191 {
6192 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6193 int bits;
6194
6195 /* Check for a single sequence of one bits and return quickly if so.
6196 The special cases of all ones and all zeroes return false.
6197 val = aarch64_replicate_bitmask_imm (val_in, mode);
6198 tmp = val + (val & -val);
6199
6200 if (tmp == (tmp & -tmp))
6201 return (val + 1) > 1;
6202
6203 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6204 if (mode == SImode)
6205 val = (val << 32) | (val & 0xffffffff);
6206
6207 /* Invert if the immediate doesn't start with a zero bit - this means we
6208 only need to search for sequences of one bits. */
6209 if (val & 1)
6210 val = ~val;
6211
6212 /* Find the first set bit and set tmp to val with the first sequence of one
6213 bits removed. Return success if there is a single sequence of ones. */
6214 first_one = val & -val;
6215 tmp = val & (val + first_one);
6216
6217 if (tmp == 0)
6218 return true;
6219
6220 /* Find the next set bit and compute the difference in bit position. */
6221 next_one = tmp & -tmp;
6222 bits = clz_hwi (first_one) - clz_hwi (next_one);
6223 mask = val ^ tmp;
6224
6225 /* Check the bit position difference is a power of 2, and that the first
6226 sequence of one bits fits within 'bits' bits. */
6227 if ((mask >> bits) != 0 || bits != (bits & -bits))
6228 return false;
6229
6230 /* Check the sequence of one bits is repeated 64/bits times. */
6231 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6232 }
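
/* Two illustrative data points for this test (not exhaustive): in DImode,
   0x0000000000ff0000 is accepted by the quick single-run check above,
   since val + (val & -val) == 0x0000000001000000 is a power of two, while
   0x0000ff00000000ff is rejected because its two runs of ones do not form
   a repeating pattern whose period is a power of two.  */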
6233
6234 /* Create a mask of ones covering the range from the lowest to the highest
6235 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6236
6237 unsigned HOST_WIDE_INT
6238 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6239 {
6240 int lowest_bit_set = ctz_hwi (val_in);
6241 int highest_bit_set = floor_log2 (val_in);
6242 gcc_assert (val_in != 0);
6243
6244 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6245 (HOST_WIDE_INT_1U << lowest_bit_set));
6246 }
6247
6248 /* Create a constant in which the bits outside the range from the lowest to
6249 the highest set bit of VAL_IN are set to 1. */
6250
6251 unsigned HOST_WIDE_INT
6252 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6253 {
6254 return val_in | ~aarch64_and_split_imm1 (val_in);
6255 }
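
/* Worked example (illustrative): VAL_IN == 0x0000ff00000000ff is not itself
   a valid AND bitmask immediate, but aarch64_and_split_imm1 gives
   0x0000ffffffffffff (a solid run from bit 0 to bit 47) and
   aarch64_and_split_imm2 gives 0xffffff00000000ff (a rotated run of 32
   ones). Their bitwise AND reproduces VAL_IN, which is the property the
   two-instruction AND split relies on.  */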
6256
6257 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6258
6259 bool
6260 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6261 {
6262 scalar_int_mode int_mode;
6263 if (!is_a <scalar_int_mode> (mode, &int_mode))
6264 return false;
6265
6266 if (aarch64_bitmask_imm (val_in, int_mode))
6267 return false;
6268
6269 if (aarch64_move_imm (val_in, int_mode))
6270 return false;
6271
6272 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6273
6274 return aarch64_bitmask_imm (imm2, int_mode);
6275 }
6276
6277 /* Return true if val is an immediate that can be loaded into a
6278 register in a single instruction. */
6279 bool
6280 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6281 {
6282 scalar_int_mode int_mode;
6283 if (!is_a <scalar_int_mode> (mode, &int_mode))
6284 return false;
6285
6286 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6287 return 1;
6288 return aarch64_bitmask_imm (val, int_mode);
6289 }
6290
6291 static bool
6292 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6293 {
6294 rtx base, offset;
6295
6296 if (GET_CODE (x) == HIGH)
6297 return true;
6298
6299 /* There's no way to calculate VL-based values using relocations. */
6300 subrtx_iterator::array_type array;
6301 FOR_EACH_SUBRTX (iter, array, x, ALL)
6302 if (GET_CODE (*iter) == CONST_POLY_INT)
6303 return true;
6304
6305 split_const (x, &base, &offset);
6306 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6307 {
6308 if (aarch64_classify_symbol (base, INTVAL (offset))
6309 != SYMBOL_FORCE_TO_MEM)
6310 return true;
6311 else
6312 /* Avoid generating a 64-bit relocation in ILP32; leave
6313 to aarch64_expand_mov_immediate to handle it properly. */
6314 return mode != ptr_mode;
6315 }
6316
6317 return aarch64_tls_referenced_p (x);
6318 }
6319
6320 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6321 The expansion for a table switch is quite expensive due to the number
6322 of instructions, the table lookup and the hard-to-predict indirect jump.
6323 When optimizing for speed with -O3 enabled, use the per-core tuning if
6324 set; otherwise use tables for > 16 cases as a tradeoff between size and
6325 performance. When optimizing for size, use the default setting. */
6326
6327 static unsigned int
6328 aarch64_case_values_threshold (void)
6329 {
6330 /* Use the specified limit for the number of cases before using jump
6331 tables at higher optimization levels. */
6332 if (optimize > 2
6333 && selected_cpu->tune->max_case_values != 0)
6334 return selected_cpu->tune->max_case_values;
6335 else
6336 return optimize_size ? default_case_values_threshold () : 17;
6337 }
6338
6339 /* Return true if register REGNO is a valid index register.
6340 STRICT_P is true if REG_OK_STRICT is in effect. */
6341
6342 bool
6343 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6344 {
6345 if (!HARD_REGISTER_NUM_P (regno))
6346 {
6347 if (!strict_p)
6348 return true;
6349
6350 if (!reg_renumber)
6351 return false;
6352
6353 regno = reg_renumber[regno];
6354 }
6355 return GP_REGNUM_P (regno);
6356 }
6357
6358 /* Return true if register REGNO is a valid base register.
6359 STRICT_P is true if REG_OK_STRICT is in effect. */
6360
6361 bool
6362 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6363 {
6364 if (!HARD_REGISTER_NUM_P (regno))
6365 {
6366 if (!strict_p)
6367 return true;
6368
6369 if (!reg_renumber)
6370 return false;
6371
6372 regno = reg_renumber[regno];
6373 }
6374
6375 /* The fake registers will be eliminated to either the stack or
6376 hard frame pointer, both of which are usually valid base registers.
6377 Reload deals with the cases where the eliminated form isn't valid. */
6378 return (GP_REGNUM_P (regno)
6379 || regno == SP_REGNUM
6380 || regno == FRAME_POINTER_REGNUM
6381 || regno == ARG_POINTER_REGNUM);
6382 }
6383
6384 /* Return true if X is a valid base register.
6385 STRICT_P is true if REG_OK_STRICT is in effect. */
6386
6387 static bool
6388 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6389 {
6390 if (!strict_p
6391 && GET_CODE (x) == SUBREG
6392 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6393 x = SUBREG_REG (x);
6394
6395 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6396 }
6397
6398 /* Return true if address offset is a valid index. If it is, fill in INFO
6399 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6400
6401 static bool
6402 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6403 machine_mode mode, bool strict_p)
6404 {
6405 enum aarch64_address_type type;
6406 rtx index;
6407 int shift;
6408
6409 /* (reg:P) */
6410 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6411 && GET_MODE (x) == Pmode)
6412 {
6413 type = ADDRESS_REG_REG;
6414 index = x;
6415 shift = 0;
6416 }
6417 /* (sign_extend:DI (reg:SI)) */
6418 else if ((GET_CODE (x) == SIGN_EXTEND
6419 || GET_CODE (x) == ZERO_EXTEND)
6420 && GET_MODE (x) == DImode
6421 && GET_MODE (XEXP (x, 0)) == SImode)
6422 {
6423 type = (GET_CODE (x) == SIGN_EXTEND)
6424 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6425 index = XEXP (x, 0);
6426 shift = 0;
6427 }
6428 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6429 else if (GET_CODE (x) == MULT
6430 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6431 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6432 && GET_MODE (XEXP (x, 0)) == DImode
6433 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6434 && CONST_INT_P (XEXP (x, 1)))
6435 {
6436 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6437 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6438 index = XEXP (XEXP (x, 0), 0);
6439 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6440 }
6441 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6442 else if (GET_CODE (x) == ASHIFT
6443 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6444 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6445 && GET_MODE (XEXP (x, 0)) == DImode
6446 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6447 && CONST_INT_P (XEXP (x, 1)))
6448 {
6449 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6450 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6451 index = XEXP (XEXP (x, 0), 0);
6452 shift = INTVAL (XEXP (x, 1));
6453 }
6454 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6455 else if ((GET_CODE (x) == SIGN_EXTRACT
6456 || GET_CODE (x) == ZERO_EXTRACT)
6457 && GET_MODE (x) == DImode
6458 && GET_CODE (XEXP (x, 0)) == MULT
6459 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6460 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6461 {
6462 type = (GET_CODE (x) == SIGN_EXTRACT)
6463 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6464 index = XEXP (XEXP (x, 0), 0);
6465 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6466 if (INTVAL (XEXP (x, 1)) != 32 + shift
6467 || INTVAL (XEXP (x, 2)) != 0)
6468 shift = -1;
6469 }
6470 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6471 (const_int 0xffffffff<<shift)) */
6472 else if (GET_CODE (x) == AND
6473 && GET_MODE (x) == DImode
6474 && GET_CODE (XEXP (x, 0)) == MULT
6475 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6476 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6477 && CONST_INT_P (XEXP (x, 1)))
6478 {
6479 type = ADDRESS_REG_UXTW;
6480 index = XEXP (XEXP (x, 0), 0);
6481 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6482 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6483 shift = -1;
6484 }
6485 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6486 else if ((GET_CODE (x) == SIGN_EXTRACT
6487 || GET_CODE (x) == ZERO_EXTRACT)
6488 && GET_MODE (x) == DImode
6489 && GET_CODE (XEXP (x, 0)) == ASHIFT
6490 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6491 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6492 {
6493 type = (GET_CODE (x) == SIGN_EXTRACT)
6494 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6495 index = XEXP (XEXP (x, 0), 0);
6496 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6497 if (INTVAL (XEXP (x, 1)) != 32 + shift
6498 || INTVAL (XEXP (x, 2)) != 0)
6499 shift = -1;
6500 }
6501 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6502 (const_int 0xffffffff<<shift)) */
6503 else if (GET_CODE (x) == AND
6504 && GET_MODE (x) == DImode
6505 && GET_CODE (XEXP (x, 0)) == ASHIFT
6506 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6507 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6508 && CONST_INT_P (XEXP (x, 1)))
6509 {
6510 type = ADDRESS_REG_UXTW;
6511 index = XEXP (XEXP (x, 0), 0);
6512 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6513 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6514 shift = -1;
6515 }
6516 /* (mult:P (reg:P) (const_int scale)) */
6517 else if (GET_CODE (x) == MULT
6518 && GET_MODE (x) == Pmode
6519 && GET_MODE (XEXP (x, 0)) == Pmode
6520 && CONST_INT_P (XEXP (x, 1)))
6521 {
6522 type = ADDRESS_REG_REG;
6523 index = XEXP (x, 0);
6524 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6525 }
6526 /* (ashift:P (reg:P) (const_int shift)) */
6527 else if (GET_CODE (x) == ASHIFT
6528 && GET_MODE (x) == Pmode
6529 && GET_MODE (XEXP (x, 0)) == Pmode
6530 && CONST_INT_P (XEXP (x, 1)))
6531 {
6532 type = ADDRESS_REG_REG;
6533 index = XEXP (x, 0);
6534 shift = INTVAL (XEXP (x, 1));
6535 }
6536 else
6537 return false;
6538
6539 if (!strict_p
6540 && GET_CODE (index) == SUBREG
6541 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6542 index = SUBREG_REG (index);
6543
6544 if (aarch64_sve_data_mode_p (mode))
6545 {
6546 if (type != ADDRESS_REG_REG
6547 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6548 return false;
6549 }
6550 else
6551 {
6552 if (shift != 0
6553 && !(IN_RANGE (shift, 1, 3)
6554 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6555 return false;
6556 }
6557
6558 if (REG_P (index)
6559 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6560 {
6561 info->type = type;
6562 info->offset = index;
6563 info->shift = shift;
6564 return true;
6565 }
6566
6567 return false;
6568 }
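
/* For instance (a sketch of the RTL shapes involved): with x1 as the base
   register and w2 as a 32-bit index, an access such as
   "ldr x0, [x1, w2, sxtw 3]" reaches the caller as

     (plus:DI (reg:DI x1)
              (ashift:DI (sign_extend:DI (reg:SI w2)) (const_int 3)))

   and the index part handled here is classified as ADDRESS_REG_SXTW with
   a shift of 3, which is accepted for an 8-byte access.  */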
6569
6570 /* Return true if MODE is one of the modes for which we
6571 support LDP/STP operations. */
6572
6573 static bool
6574 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6575 {
6576 return mode == SImode || mode == DImode
6577 || mode == SFmode || mode == DFmode
6578 || (aarch64_vector_mode_supported_p (mode)
6579 && (known_eq (GET_MODE_SIZE (mode), 8)
6580 || (known_eq (GET_MODE_SIZE (mode), 16)
6581 && (aarch64_tune_params.extra_tuning_flags
6582 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6583 }
6584
6585 /* Return true if REGNO is a virtual pointer register, or an eliminable
6586 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6587 include stack_pointer or hard_frame_pointer. */
6588 static bool
6589 virt_or_elim_regno_p (unsigned regno)
6590 {
6591 return ((regno >= FIRST_VIRTUAL_REGISTER
6592 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6593 || regno == FRAME_POINTER_REGNUM
6594 || regno == ARG_POINTER_REGNUM);
6595 }
6596
6597 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6598 If it is, fill in INFO appropriately. STRICT_P is true if
6599 REG_OK_STRICT is in effect. */
6600
6601 bool
6602 aarch64_classify_address (struct aarch64_address_info *info,
6603 rtx x, machine_mode mode, bool strict_p,
6604 aarch64_addr_query_type type)
6605 {
6606 enum rtx_code code = GET_CODE (x);
6607 rtx op0, op1;
6608 poly_int64 offset;
6609
6610 HOST_WIDE_INT const_size;
6611
6612 /* On BE, we use load/store pair for all large int mode load/stores.
6613 TI/TFmode may also use a load/store pair. */
6614 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6615 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6616 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6617 || type == ADDR_QUERY_LDP_STP_N
6618 || mode == TImode
6619 || mode == TFmode
6620 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6621
6622 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6623 corresponds to the actual size of the memory being loaded/stored and the
6624 mode used for the address calculation is half of that.
6625 if (type == ADDR_QUERY_LDP_STP_N
6626 && known_eq (GET_MODE_SIZE (mode), 16))
6627 mode = DFmode;
6628
6629 bool allow_reg_index_p = (!load_store_pair_p
6630 && (known_lt (GET_MODE_SIZE (mode), 16)
6631 || vec_flags == VEC_ADVSIMD
6632 || vec_flags & VEC_SVE_DATA));
6633
6634 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6635 [Rn, #offset, MUL VL]. */
6636 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6637 && (code != REG && code != PLUS))
6638 return false;
6639
6640 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6641 REG addressing. */
6642 if (advsimd_struct_p
6643 && !BYTES_BIG_ENDIAN
6644 && (code != POST_INC && code != REG))
6645 return false;
6646
6647 gcc_checking_assert (GET_MODE (x) == VOIDmode
6648 || SCALAR_INT_MODE_P (GET_MODE (x)));
6649
6650 switch (code)
6651 {
6652 case REG:
6653 case SUBREG:
6654 info->type = ADDRESS_REG_IMM;
6655 info->base = x;
6656 info->offset = const0_rtx;
6657 info->const_offset = 0;
6658 return aarch64_base_register_rtx_p (x, strict_p);
6659
6660 case PLUS:
6661 op0 = XEXP (x, 0);
6662 op1 = XEXP (x, 1);
6663
6664 if (! strict_p
6665 && REG_P (op0)
6666 && virt_or_elim_regno_p (REGNO (op0))
6667 && poly_int_rtx_p (op1, &offset))
6668 {
6669 info->type = ADDRESS_REG_IMM;
6670 info->base = op0;
6671 info->offset = op1;
6672 info->const_offset = offset;
6673
6674 return true;
6675 }
6676
6677 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6678 && aarch64_base_register_rtx_p (op0, strict_p)
6679 && poly_int_rtx_p (op1, &offset))
6680 {
6681 info->type = ADDRESS_REG_IMM;
6682 info->base = op0;
6683 info->offset = op1;
6684 info->const_offset = offset;
6685
6686 /* TImode and TFmode values are allowed in both pairs of X
6687 registers and individual Q registers. The available
6688 address modes are:
6689 X,X: 7-bit signed scaled offset
6690 Q: 9-bit signed offset
6691 We conservatively require an offset representable in either mode.
6692 When performing the check for pairs of X registers i.e. LDP/STP
6693 pass down DImode since that is the natural size of the LDP/STP
6694 instruction memory accesses. */
6695 if (mode == TImode || mode == TFmode)
6696 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6697 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6698 || offset_12bit_unsigned_scaled_p (mode, offset)));
6699
6700 /* A 7-bit offset check because OImode will emit an ldp/stp
6701 instruction (only big endian will get here).
6702 For ldp/stp instructions, the offset is scaled for the size of a
6703 single element of the pair. */
6704 if (mode == OImode)
6705 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6706
6707 /* Three 9/12-bit offset checks because CImode will emit three
6708 ldr/str instructions (only big endian will get here). */
6709 if (mode == CImode)
6710 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6711 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6712 offset + 32)
6713 || offset_12bit_unsigned_scaled_p (V16QImode,
6714 offset + 32)));
6715
6716 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6717 instructions (only big endian will get here). */
6718 if (mode == XImode)
6719 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6720 && aarch64_offset_7bit_signed_scaled_p (TImode,
6721 offset + 32));
6722
6723 /* Make "m" use the LD1 offset range for SVE data modes, so
6724 that pre-RTL optimizers like ivopts will work to that
6725 instead of the wider LDR/STR range. */
6726 if (vec_flags == VEC_SVE_DATA)
6727 return (type == ADDR_QUERY_M
6728 ? offset_4bit_signed_scaled_p (mode, offset)
6729 : offset_9bit_signed_scaled_p (mode, offset));
6730
6731 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6732 {
6733 poly_int64 end_offset = (offset
6734 + GET_MODE_SIZE (mode)
6735 - BYTES_PER_SVE_VECTOR);
6736 return (type == ADDR_QUERY_M
6737 ? offset_4bit_signed_scaled_p (mode, offset)
6738 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6739 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6740 end_offset)));
6741 }
6742
6743 if (vec_flags == VEC_SVE_PRED)
6744 return offset_9bit_signed_scaled_p (mode, offset);
6745
6746 if (load_store_pair_p)
6747 return ((known_eq (GET_MODE_SIZE (mode), 4)
6748 || known_eq (GET_MODE_SIZE (mode), 8)
6749 || known_eq (GET_MODE_SIZE (mode), 16))
6750 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6751 else
6752 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6753 || offset_12bit_unsigned_scaled_p (mode, offset));
6754 }
6755
6756 if (allow_reg_index_p)
6757 {
6758 /* Look for base + (scaled/extended) index register. */
6759 if (aarch64_base_register_rtx_p (op0, strict_p)
6760 && aarch64_classify_index (info, op1, mode, strict_p))
6761 {
6762 info->base = op0;
6763 return true;
6764 }
6765 if (aarch64_base_register_rtx_p (op1, strict_p)
6766 && aarch64_classify_index (info, op0, mode, strict_p))
6767 {
6768 info->base = op1;
6769 return true;
6770 }
6771 }
6772
6773 return false;
6774
6775 case POST_INC:
6776 case POST_DEC:
6777 case PRE_INC:
6778 case PRE_DEC:
6779 info->type = ADDRESS_REG_WB;
6780 info->base = XEXP (x, 0);
6781 info->offset = NULL_RTX;
6782 return aarch64_base_register_rtx_p (info->base, strict_p);
6783
6784 case POST_MODIFY:
6785 case PRE_MODIFY:
6786 info->type = ADDRESS_REG_WB;
6787 info->base = XEXP (x, 0);
6788 if (GET_CODE (XEXP (x, 1)) == PLUS
6789 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6790 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6791 && aarch64_base_register_rtx_p (info->base, strict_p))
6792 {
6793 info->offset = XEXP (XEXP (x, 1), 1);
6794 info->const_offset = offset;
6795
6796 /* TImode and TFmode values are allowed in both pairs of X
6797 registers and individual Q registers. The available
6798 address modes are:
6799 X,X: 7-bit signed scaled offset
6800 Q: 9-bit signed offset
6801 We conservatively require an offset representable in either mode.
6802 */
6803 if (mode == TImode || mode == TFmode)
6804 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6805 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6806
6807 if (load_store_pair_p)
6808 return ((known_eq (GET_MODE_SIZE (mode), 4)
6809 || known_eq (GET_MODE_SIZE (mode), 8)
6810 || known_eq (GET_MODE_SIZE (mode), 16))
6811 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6812 else
6813 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6814 }
6815 return false;
6816
6817 case CONST:
6818 case SYMBOL_REF:
6819 case LABEL_REF:
6820 /* load literal: pc-relative constant pool entry. Only supported
6821 for SI mode or larger. */
6822 info->type = ADDRESS_SYMBOLIC;
6823
6824 if (!load_store_pair_p
6825 && GET_MODE_SIZE (mode).is_constant (&const_size)
6826 && const_size >= 4)
6827 {
6828 rtx sym, addend;
6829
6830 split_const (x, &sym, &addend);
6831 return ((GET_CODE (sym) == LABEL_REF
6832 || (GET_CODE (sym) == SYMBOL_REF
6833 && CONSTANT_POOL_ADDRESS_P (sym)
6834 && aarch64_pcrelative_literal_loads)));
6835 }
6836 return false;
6837
6838 case LO_SUM:
6839 info->type = ADDRESS_LO_SUM;
6840 info->base = XEXP (x, 0);
6841 info->offset = XEXP (x, 1);
6842 if (allow_reg_index_p
6843 && aarch64_base_register_rtx_p (info->base, strict_p))
6844 {
6845 rtx sym, offs;
6846 split_const (info->offset, &sym, &offs);
6847 if (GET_CODE (sym) == SYMBOL_REF
6848 && (aarch64_classify_symbol (sym, INTVAL (offs))
6849 == SYMBOL_SMALL_ABSOLUTE))
6850 {
6851 /* The symbol and offset must be aligned to the access size. */
6852 unsigned int align;
6853
6854 if (CONSTANT_POOL_ADDRESS_P (sym))
6855 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6856 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6857 {
6858 tree exp = SYMBOL_REF_DECL (sym);
6859 align = TYPE_ALIGN (TREE_TYPE (exp));
6860 align = aarch64_constant_alignment (exp, align);
6861 }
6862 else if (SYMBOL_REF_DECL (sym))
6863 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6864 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6865 && SYMBOL_REF_BLOCK (sym) != NULL)
6866 align = SYMBOL_REF_BLOCK (sym)->alignment;
6867 else
6868 align = BITS_PER_UNIT;
6869
6870 poly_int64 ref_size = GET_MODE_SIZE (mode);
6871 if (known_eq (ref_size, 0))
6872 ref_size = GET_MODE_SIZE (DImode);
6873
6874 return (multiple_p (INTVAL (offs), ref_size)
6875 && multiple_p (align / BITS_PER_UNIT, ref_size));
6876 }
6877 }
6878 return false;
6879
6880 default:
6881 return false;
6882 }
6883 }
6884
6885 /* Return true if the address X is valid for a PRFM instruction.
6886 STRICT_P is true if we should do strict checking with
6887 aarch64_classify_address. */
6888
6889 bool
6890 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6891 {
6892 struct aarch64_address_info addr;
6893
6894 /* PRFM accepts the same addresses as DImode... */
6895 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6896 if (!res)
6897 return false;
6898
6899 /* ... except writeback forms. */
6900 return addr.type != ADDRESS_REG_WB;
6901 }
6902
6903 bool
6904 aarch64_symbolic_address_p (rtx x)
6905 {
6906 rtx offset;
6907
6908 split_const (x, &x, &offset);
6909 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6910 }
6911
6912 /* Classify the base of symbolic expression X. */
6913
6914 enum aarch64_symbol_type
6915 aarch64_classify_symbolic_expression (rtx x)
6916 {
6917 rtx offset;
6918
6919 split_const (x, &x, &offset);
6920 return aarch64_classify_symbol (x, INTVAL (offset));
6921 }
6922
6923
6924 /* Return TRUE if X is a legitimate address for accessing memory in
6925 mode MODE. */
6926 static bool
6927 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6928 {
6929 struct aarch64_address_info addr;
6930
6931 return aarch64_classify_address (&addr, x, mode, strict_p);
6932 }
6933
6934 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6935 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6936 bool
6937 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6938 aarch64_addr_query_type type)
6939 {
6940 struct aarch64_address_info addr;
6941
6942 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6943 }
6944
6945 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6946
6947 static bool
6948 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6949 poly_int64 orig_offset,
6950 machine_mode mode)
6951 {
6952 HOST_WIDE_INT size;
6953 if (GET_MODE_SIZE (mode).is_constant (&size))
6954 {
6955 HOST_WIDE_INT const_offset, second_offset;
6956
6957 /* A general SVE offset is A * VQ + B. Remove the A component from
6958 coefficient 0 in order to get the constant B. */
6959 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6960
6961 /* Split an out-of-range address displacement into a base and
6962 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
6963 range otherwise, to increase opportunities for sharing the base
6964 address between accesses of different sizes. Unaligned accesses use
6965 the signed 9-bit range; TImode/TFmode use the intersection of the
6966 signed scaled 7-bit and signed 9-bit offset ranges. */
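/* Each mask below wraps CONST_OFFSET into a signed window around zero:
e.g. ((const_offset + 0x100) & 0x1ff) - 0x100 maps const_offset to the
value in [-256, 255] that is congruent to it mod 512, so for
const_offset == 300 the split is 512 + (-212). The TImode/TFmode form
additionally clears the low three bits, keeping the result a multiple
of 8. */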
6967 if (mode == TImode || mode == TFmode)
6968 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6969 else if ((const_offset & (size - 1)) != 0)
6970 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6971 else
6972 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6973
6974 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6975 return false;
6976
6977 /* Split the offset into second_offset and the rest. */
6978 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6979 *offset2 = gen_int_mode (second_offset, Pmode);
6980 return true;
6981 }
6982 else
6983 {
6984 /* Get the mode we should use as the basis of the range. For structure
6985 modes this is the mode of one vector. */
6986 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6987 machine_mode step_mode
6988 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6989
6990 /* Get the "mul vl" multiplier we'd like to use. */
6991 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6992 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6993 if (vec_flags & VEC_SVE_DATA)
6994 /* LDR supports a 9-bit range, but the move patterns for
6995 structure modes require all vectors to be in range of the
6996 same base. The simplest way of accommodating that while still
6997 promoting reuse of anchor points between different modes is
6998 to use an 8-bit range unconditionally. */
6999 vnum = ((vnum + 128) & 255) - 128;
7000 else
7001 /* Predicates are only handled singly, so we might as well use
7002 the full range. */
7003 vnum = ((vnum + 256) & 511) - 256;
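/* E.g. an SVE data-mode VNUM of 200 becomes ((200 + 128) & 255) - 128 == -56,
so the anchor absorbs a multiple of 256 vectors (200 - (-56)). */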
7004 if (vnum == 0)
7005 return false;
7006
7007 /* Convert the "mul vl" multiplier into a byte offset. */
7008 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7009 if (known_eq (second_offset, orig_offset))
7010 return false;
7011
7012 /* Split the offset into second_offset and the rest. */
7013 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7014 *offset2 = gen_int_mode (second_offset, Pmode);
7015 return true;
7016 }
7017 }
7018
7019 /* Return the binary representation of floating point constant VALUE in INTVAL.
7020 If the value cannot be converted, return false without setting INTVAL.
7021 The conversion is done in the mode of VALUE. */
7022 bool
7023 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7024 {
7025
7026 /* We make a general exception for 0. */
7027 if (aarch64_float_const_zero_rtx_p (value))
7028 {
7029 *intval = 0;
7030 return true;
7031 }
7032
7033 scalar_float_mode mode;
7034 if (GET_CODE (value) != CONST_DOUBLE
7035 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7036 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7037 /* Only support up to DF mode. */
7038 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7039 return false;
7040
7041 unsigned HOST_WIDE_INT ival = 0;
7042
7043 long res[2];
7044 real_to_target (res,
7045 CONST_DOUBLE_REAL_VALUE (value),
7046 REAL_MODE_FORMAT (mode));
7047
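/* real_to_target returns the image as 32-bit chunks; reassemble them
into a single HOST_WIDE_INT below (e.g. DFmode 1.0 becomes
0x3ff0000000000000). */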
7048 if (mode == DFmode)
7049 {
7050 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7051 ival = zext_hwi (res[order], 32);
7052 ival |= (zext_hwi (res[1 - order], 32) << 32);
7053 }
7054 else
7055 ival = zext_hwi (res[0], 32);
7056
7057 *intval = ival;
7058 return true;
7059 }
7060
7061 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7062 single MOV(+MOVK) followed by an FMOV. */
7063 bool
7064 aarch64_float_const_rtx_p (rtx x)
7065 {
7066 machine_mode mode = GET_MODE (x);
7067 if (mode == VOIDmode)
7068 return false;
7069
7070 /* Determine whether it's cheaper to write float constants as
7071 mov/movk pairs rather than as ldr/adrp pairs. */
7072 unsigned HOST_WIDE_INT ival;
7073
7074 if (GET_CODE (x) == CONST_DOUBLE
7075 && SCALAR_FLOAT_MODE_P (mode)
7076 && aarch64_reinterpret_float_as_int (x, &ival))
7077 {
7078 scalar_int_mode imode = (mode == HFmode
7079 ? SImode
7080 : int_mode_for_mode (mode).require ());
7081 int num_instr = aarch64_internal_mov_immediate
7082 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
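/* Accept the constant when the integer move needs at most a MOV plus
one MOVK (i.e. fewer than three instructions); the FMOV itself is
not counted here. */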
7083 return num_instr < 3;
7084 }
7085
7086 return false;
7087 }
7088
7089 /* Return TRUE if rtx X is the immediate constant 0.0. */
7090 bool
7091 aarch64_float_const_zero_rtx_p (rtx x)
7092 {
7093 if (GET_MODE (x) == VOIDmode)
7094 return false;
7095
7096 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7097 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7098 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7099 }
7100
7101 /* Return TRUE if rtx X is an immediate constant that fits in a single
7102 MOVI immediate operation. */
7103 bool
7104 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7105 {
7106 if (!TARGET_SIMD)
7107 return false;
7108
7109 machine_mode vmode;
7110 scalar_int_mode imode;
7111 unsigned HOST_WIDE_INT ival;
7112
7113 if (GET_CODE (x) == CONST_DOUBLE
7114 && SCALAR_FLOAT_MODE_P (mode))
7115 {
7116 if (!aarch64_reinterpret_float_as_int (x, &ival))
7117 return false;
7118
7119 /* We make a general exception for 0. */
7120 if (aarch64_float_const_zero_rtx_p (x))
7121 return true;
7122
7123 imode = int_mode_for_mode (mode).require ();
7124 }
7125 else if (GET_CODE (x) == CONST_INT
7126 && is_a <scalar_int_mode> (mode, &imode))
7127 ival = INTVAL (x);
7128 else
7129 return false;
7130
7131 /* Use a 64-bit vector container for everything except DI/DF mode, where
7132 we use a 128-bit vector mode. */
7133 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7134
7135 vmode = aarch64_simd_container_mode (imode, width);
7136 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7137
7138 return aarch64_simd_valid_immediate (v_op, NULL);
7139 }
7140
7141
7142 /* Return the fixed registers used for condition codes. */
7143
7144 static bool
7145 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7146 {
7147 *p1 = CC_REGNUM;
7148 *p2 = INVALID_REGNUM;
7149 return true;
7150 }
7151
7152 /* This function is used by the call expanders of the machine description.
7153 RESULT is the register in which the result is returned. It's NULL for
7154 "call" and "sibcall".
7155 MEM is the location of the function call.
7156 SIBCALL indicates whether this function call is a normal call or a
7157 sibling call; we generate a different pattern accordingly. */
7158
7159 void
7160 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7161 {
7162 rtx call, callee, tmp;
7163 rtvec vec;
7164 machine_mode mode;
7165
7166 gcc_assert (MEM_P (mem));
7167 callee = XEXP (mem, 0);
7168 mode = GET_MODE (callee);
7169 gcc_assert (mode == Pmode);
7170
7171 /* Decide if we should generate indirect calls by loading the
7172 address of the callee into a register before performing
7173 the branch-and-link. */
7174 if (SYMBOL_REF_P (callee)
7175 ? (aarch64_is_long_call_p (callee)
7176 || aarch64_is_noplt_call_p (callee))
7177 : !REG_P (callee))
7178 XEXP (mem, 0) = force_reg (mode, callee);
7179
7180 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7181
7182 if (result != NULL_RTX)
7183 call = gen_rtx_SET (result, call);
7184
7185 if (sibcall)
7186 tmp = ret_rtx;
7187 else
7188 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7189
7190 vec = gen_rtvec (2, call, tmp);
7191 call = gen_rtx_PARALLEL (VOIDmode, vec);
7192
7193 aarch64_emit_call_insn (call);
7194 }
7195
7196 /* Emit call insn with PAT and do aarch64-specific handling. */
7197
7198 void
7199 aarch64_emit_call_insn (rtx pat)
7200 {
7201 rtx insn = emit_call_insn (pat);
7202
7203 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7204 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7205 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7206 }
7207
7208 machine_mode
7209 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7210 {
7211 machine_mode mode_x = GET_MODE (x);
7212 rtx_code code_x = GET_CODE (x);
7213
7214 /* All floating point compares return CCFP if it is an equality
7215 comparison, and CCFPE otherwise. */
7216 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7217 {
7218 switch (code)
7219 {
7220 case EQ:
7221 case NE:
7222 case UNORDERED:
7223 case ORDERED:
7224 case UNLT:
7225 case UNLE:
7226 case UNGT:
7227 case UNGE:
7228 case UNEQ:
7229 return CCFPmode;
7230
7231 case LT:
7232 case LE:
7233 case GT:
7234 case GE:
7235 case LTGT:
7236 return CCFPEmode;
7237
7238 default:
7239 gcc_unreachable ();
7240 }
7241 }
7242
7243 /* Equality comparisons of short modes against zero can be performed
7244 using the TST instruction with the appropriate bitmask. */
7245 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7246 && (code == EQ || code == NE)
7247 && (mode_x == HImode || mode_x == QImode))
7248 return CC_NZmode;
7249
7250 /* Similarly, comparisons of zero_extends from shorter modes can
7251 be performed using an ANDS with an immediate mask. */
7252 if (y == const0_rtx && code_x == ZERO_EXTEND
7253 && (mode_x == SImode || mode_x == DImode)
7254 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7255 && (code == EQ || code == NE))
7256 return CC_NZmode;
7257
7258 if ((mode_x == SImode || mode_x == DImode)
7259 && y == const0_rtx
7260 && (code == EQ || code == NE || code == LT || code == GE)
7261 && (code_x == PLUS || code_x == MINUS || code_x == AND
7262 || code_x == NEG
7263 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7264 && CONST_INT_P (XEXP (x, 2)))))
7265 return CC_NZmode;
7266
7267 /* A compare with a shifted operand. Because of canonicalization,
7268 the comparison will have to be swapped when we emit the assembly
7269 code. */
7270 if ((mode_x == SImode || mode_x == DImode)
7271 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7272 && (code_x == ASHIFT || code_x == ASHIFTRT
7273 || code_x == LSHIFTRT
7274 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7275 return CC_SWPmode;
7276
7277 /* Similarly for a negated operand, but we can only do this for
7278 equalities. */
7279 if ((mode_x == SImode || mode_x == DImode)
7280 && (REG_P (y) || GET_CODE (y) == SUBREG)
7281 && (code == EQ || code == NE)
7282 && code_x == NEG)
7283 return CC_Zmode;
7284
7285 /* A test for unsigned overflow from an addition. */
7286 if ((mode_x == DImode || mode_x == TImode)
7287 && (code == LTU || code == GEU)
7288 && code_x == PLUS
7289 && rtx_equal_p (XEXP (x, 0), y))
7290 return CC_Cmode;
7291
7292 /* A test for unsigned overflow from an add with carry. */
7293 if ((mode_x == DImode || mode_x == TImode)
7294 && (code == LTU || code == GEU)
7295 && code_x == PLUS
7296 && CONST_SCALAR_INT_P (y)
7297 && (rtx_mode_t (y, mode_x)
7298 == (wi::shwi (1, mode_x)
7299 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7300 return CC_ADCmode;
7301
7302 /* A test for signed overflow. */
7303 if ((mode_x == DImode || mode_x == TImode)
7304 && code == NE
7305 && code_x == PLUS
7306 && GET_CODE (y) == SIGN_EXTEND)
7307 return CC_Vmode;
7308
7309 /* For everything else, return CCmode. */
7310 return CCmode;
7311 }
7312
7313 static int
7314 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7315
7316 int
7317 aarch64_get_condition_code (rtx x)
7318 {
7319 machine_mode mode = GET_MODE (XEXP (x, 0));
7320 enum rtx_code comp_code = GET_CODE (x);
7321
7322 if (GET_MODE_CLASS (mode) != MODE_CC)
7323 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7324 return aarch64_get_condition_code_1 (mode, comp_code);
7325 }
7326
7327 static int
7328 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7329 {
7330 switch (mode)
7331 {
7332 case E_CCFPmode:
7333 case E_CCFPEmode:
7334 switch (comp_code)
7335 {
7336 case GE: return AARCH64_GE;
7337 case GT: return AARCH64_GT;
7338 case LE: return AARCH64_LS;
7339 case LT: return AARCH64_MI;
7340 case NE: return AARCH64_NE;
7341 case EQ: return AARCH64_EQ;
7342 case ORDERED: return AARCH64_VC;
7343 case UNORDERED: return AARCH64_VS;
7344 case UNLT: return AARCH64_LT;
7345 case UNLE: return AARCH64_LE;
7346 case UNGT: return AARCH64_HI;
7347 case UNGE: return AARCH64_PL;
7348 default: return -1;
7349 }
7350 break;
7351
7352 case E_CCmode:
7353 switch (comp_code)
7354 {
7355 case NE: return AARCH64_NE;
7356 case EQ: return AARCH64_EQ;
7357 case GE: return AARCH64_GE;
7358 case GT: return AARCH64_GT;
7359 case LE: return AARCH64_LE;
7360 case LT: return AARCH64_LT;
7361 case GEU: return AARCH64_CS;
7362 case GTU: return AARCH64_HI;
7363 case LEU: return AARCH64_LS;
7364 case LTU: return AARCH64_CC;
7365 default: return -1;
7366 }
7367 break;
7368
7369 case E_CC_SWPmode:
7370 switch (comp_code)
7371 {
7372 case NE: return AARCH64_NE;
7373 case EQ: return AARCH64_EQ;
7374 case GE: return AARCH64_LE;
7375 case GT: return AARCH64_LT;
7376 case LE: return AARCH64_GE;
7377 case LT: return AARCH64_GT;
7378 case GEU: return AARCH64_LS;
7379 case GTU: return AARCH64_CC;
7380 case LEU: return AARCH64_CS;
7381 case LTU: return AARCH64_HI;
7382 default: return -1;
7383 }
7384 break;
7385
7386 case E_CC_NZmode:
7387 switch (comp_code)
7388 {
7389 case NE: return AARCH64_NE;
7390 case EQ: return AARCH64_EQ;
7391 case GE: return AARCH64_PL;
7392 case LT: return AARCH64_MI;
7393 default: return -1;
7394 }
7395 break;
7396
7397 case E_CC_Zmode:
7398 switch (comp_code)
7399 {
7400 case NE: return AARCH64_NE;
7401 case EQ: return AARCH64_EQ;
7402 default: return -1;
7403 }
7404 break;
7405
7406 case E_CC_Cmode:
7407 switch (comp_code)
7408 {
7409 case LTU: return AARCH64_CS;
7410 case GEU: return AARCH64_CC;
7411 default: return -1;
7412 }
7413 break;
7414
7415 case E_CC_ADCmode:
7416 switch (comp_code)
7417 {
7418 case GEU: return AARCH64_CS;
7419 case LTU: return AARCH64_CC;
7420 default: return -1;
7421 }
7422 break;
7423
7424 case E_CC_Vmode:
7425 switch (comp_code)
7426 {
7427 case NE: return AARCH64_VS;
7428 case EQ: return AARCH64_VC;
7429 default: return -1;
7430 }
7431 break;
7432
7433 default:
7434 return -1;
7435 }
7436
7437 return -1;
7438 }
7439
7440 bool
7441 aarch64_const_vec_all_same_in_range_p (rtx x,
7442 HOST_WIDE_INT minval,
7443 HOST_WIDE_INT maxval)
7444 {
7445 rtx elt;
7446 return (const_vec_duplicate_p (x, &elt)
7447 && CONST_INT_P (elt)
7448 && IN_RANGE (INTVAL (elt), minval, maxval));
7449 }
7450
7451 bool
7452 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7453 {
7454 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7455 }
7456
7457 /* Return true if VEC is a constant in which every element is in the range
7458 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7459
7460 static bool
7461 aarch64_const_vec_all_in_range_p (rtx vec,
7462 HOST_WIDE_INT minval,
7463 HOST_WIDE_INT maxval)
7464 {
7465 if (GET_CODE (vec) != CONST_VECTOR
7466 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7467 return false;
7468
7469 int nunits;
7470 if (!CONST_VECTOR_STEPPED_P (vec))
7471 nunits = const_vector_encoded_nelts (vec);
7472 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7473 return false;
7474
7475 for (int i = 0; i < nunits; i++)
7476 {
7477 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7478 if (!CONST_INT_P (vec_elem)
7479 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7480 return false;
7481 }
7482 return true;
7483 }
7484
7485 /* N Z C V. */
7486 #define AARCH64_CC_V 1
7487 #define AARCH64_CC_C (1 << 1)
7488 #define AARCH64_CC_Z (1 << 2)
7489 #define AARCH64_CC_N (1 << 3)
7490
7491 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7492 static const int aarch64_nzcv_codes[] =
7493 {
7494 0, /* EQ, Z == 1. */
7495 AARCH64_CC_Z, /* NE, Z == 0. */
7496 0, /* CS, C == 1. */
7497 AARCH64_CC_C, /* CC, C == 0. */
7498 0, /* MI, N == 1. */
7499 AARCH64_CC_N, /* PL, N == 0. */
7500 0, /* VS, V == 1. */
7501 AARCH64_CC_V, /* VC, V == 0. */
7502 0, /* HI, C == 1 && Z == 0. */
7503 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7504 AARCH64_CC_V, /* GE, N == V. */
7505 0, /* LT, N != V. */
7506 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7507 0, /* LE, !(Z == 0 && N == V). */
7508 0, /* AL, Any. */
7509 0 /* NV, Any. */
7510 };
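/* Apart from the AL/NV entries, each value above gives an NZCV setting
under which the commented condition is false (e.g. EQ holds when
Z == 1, and its entry leaves Z clear); this is the immediate printed
by the '%k' operand code for conditional compare instructions. */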
7511
7512 /* Print floating-point vector immediate operand X to F, negating it
7513 first if NEGATE is true. Return true on success, false if it isn't
7514 a constant we can handle. */
7515
7516 static bool
7517 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7518 {
7519 rtx elt;
7520
7521 if (!const_vec_duplicate_p (x, &elt))
7522 return false;
7523
7524 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7525 if (negate)
7526 r = real_value_negate (&r);
7527
7528 /* We only handle the SVE single-bit immediates here. */
7529 if (real_equal (&r, &dconst0))
7530 asm_fprintf (f, "0.0");
7531 else if (real_equal (&r, &dconst1))
7532 asm_fprintf (f, "1.0");
7533 else if (real_equal (&r, &dconsthalf))
7534 asm_fprintf (f, "0.5");
7535 else
7536 return false;
7537
7538 return true;
7539 }
7540
7541 /* Return the b/h/s/d register-suffix letter for an element of SIZE bits. */
7542 static char
7543 sizetochar (int size)
7544 {
7545 switch (size)
7546 {
7547 case 64: return 'd';
7548 case 32: return 's';
7549 case 16: return 'h';
7550 case 8 : return 'b';
7551 default: gcc_unreachable ();
7552 }
7553 }
7554
7555 /* Print operand X to file F in a target specific manner according to CODE.
7556 The acceptable formatting commands given by CODE are:
7557 'c': An integer or symbol address without a preceding #
7558 sign.
7559 'C': Take the duplicated element in a vector constant
7560 and print it in hex.
7561 'D': Take the duplicated element in a vector constant
7562 and print it as an unsigned integer, in decimal.
7563 'e': Print the sign/zero-extend size as a character 8->b,
7564 16->h, 32->w.
7565 'p': Prints N such that 2^N == X (X must be a power of 2 and
7566 a const int).
7567 'P': Print the number of non-zero bits in X (a const_int).
7568 'H': Print the higher numbered register of a pair (TImode)
7569 of regs.
7570 'm': Print a condition (eq, ne, etc).
7571 'M': Same as 'm', but invert condition.
7572 'N': Take the duplicated element in a vector constant
7573 and print the negative of it in decimal.
7574 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7575 'S/T/U/V': Print a FP/SIMD register name for a register list.
7576 The register printed is the FP/SIMD register name
7577 of X + 0/1/2/3 for S/T/U/V.
7578 'R': Print a scalar FP/SIMD register name + 1.
7579 'X': Print bottom 16 bits of integer constant in hex.
7580 'w/x': Print a general register name or the zero register
7581 (32-bit or 64-bit).
7582 '0': Print a normal operand; if it's a general register,
7583 we assume DImode.
7584 'k': Print NZCV for conditional compare instructions.
7585 'A': Output address constant representing the first
7586 argument of X, specifying a relocation offset
7587 if appropriate.
7588 'L': Output constant address specified by X
7589 with a relocation offset if appropriate.
7590 'G': Prints address of X, specifying a PC relative
7591 relocation mode if appropriate.
7592 'y': Output address of LDP or STP - this is used for
7593 some LDP/STPs which don't use a PARALLEL in their
7594 pattern (so the mode needs to be adjusted).
7595 'z': Output address of a typical LDP or STP. */
7596
7597 static void
7598 aarch64_print_operand (FILE *f, rtx x, int code)
7599 {
7600 rtx elt;
7601 switch (code)
7602 {
7603 case 'c':
7604 switch (GET_CODE (x))
7605 {
7606 case CONST_INT:
7607 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7608 break;
7609
7610 case SYMBOL_REF:
7611 output_addr_const (f, x);
7612 break;
7613
7614 case CONST:
7615 if (GET_CODE (XEXP (x, 0)) == PLUS
7616 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7617 {
7618 output_addr_const (f, x);
7619 break;
7620 }
7621 /* Fall through. */
7622
7623 default:
7624 output_operand_lossage ("unsupported operand for code '%c'", code);
7625 }
7626 break;
7627
7628 case 'e':
7629 {
7630 int n;
7631
7632 if (!CONST_INT_P (x)
7633 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7634 {
7635 output_operand_lossage ("invalid operand for '%%%c'", code);
7636 return;
7637 }
7638
7639 switch (n)
7640 {
7641 case 3:
7642 fputc ('b', f);
7643 break;
7644 case 4:
7645 fputc ('h', f);
7646 break;
7647 case 5:
7648 fputc ('w', f);
7649 break;
7650 default:
7651 output_operand_lossage ("invalid operand for '%%%c'", code);
7652 return;
7653 }
7654 }
7655 break;
7656
7657 case 'p':
7658 {
7659 int n;
7660
7661 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7662 {
7663 output_operand_lossage ("invalid operand for '%%%c'", code);
7664 return;
7665 }
7666
7667 asm_fprintf (f, "%d", n);
7668 }
7669 break;
7670
7671 case 'P':
7672 if (!CONST_INT_P (x))
7673 {
7674 output_operand_lossage ("invalid operand for '%%%c'", code);
7675 return;
7676 }
7677
7678 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7679 break;
7680
7681 case 'H':
7682 if (x == const0_rtx)
7683 {
7684 asm_fprintf (f, "xzr");
7685 break;
7686 }
7687
7688 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7689 {
7690 output_operand_lossage ("invalid operand for '%%%c'", code);
7691 return;
7692 }
7693
7694 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7695 break;
7696
7697 case 'M':
7698 case 'm':
7699 {
7700 int cond_code;
7701 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7702 if (x == const_true_rtx)
7703 {
7704 if (code == 'M')
7705 fputs ("nv", f);
7706 return;
7707 }
7708
7709 if (!COMPARISON_P (x))
7710 {
7711 output_operand_lossage ("invalid operand for '%%%c'", code);
7712 return;
7713 }
7714
7715 cond_code = aarch64_get_condition_code (x);
7716 gcc_assert (cond_code >= 0);
7717 if (code == 'M')
7718 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7719 fputs (aarch64_condition_codes[cond_code], f);
7720 }
7721 break;
7722
7723 case 'N':
7724 if (!const_vec_duplicate_p (x, &elt))
7725 {
7726 output_operand_lossage ("invalid vector constant");
7727 return;
7728 }
7729
7730 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7731 asm_fprintf (f, "%wd", -INTVAL (elt));
7732 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7733 && aarch64_print_vector_float_operand (f, x, true))
7734 ;
7735 else
7736 {
7737 output_operand_lossage ("invalid vector constant");
7738 return;
7739 }
7740 break;
7741
7742 case 'b':
7743 case 'h':
7744 case 's':
7745 case 'd':
7746 case 'q':
7747 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7748 {
7749 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7750 return;
7751 }
7752 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7753 break;
7754
7755 case 'S':
7756 case 'T':
7757 case 'U':
7758 case 'V':
7759 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7760 {
7761 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7762 return;
7763 }
7764 asm_fprintf (f, "%c%d",
7765 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7766 REGNO (x) - V0_REGNUM + (code - 'S'));
7767 break;
7768
7769 case 'R':
7770 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7771 {
7772 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7773 return;
7774 }
7775 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7776 break;
7777
7778 case 'X':
7779 if (!CONST_INT_P (x))
7780 {
7781 output_operand_lossage ("invalid operand for '%%%c'", code);
7782 return;
7783 }
7784 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7785 break;
7786
7787 case 'C':
7788 {
7789 /* Print a replicated constant in hex. */
7790 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7791 {
7792 output_operand_lossage ("invalid operand for '%%%c'", code);
7793 return;
7794 }
7795 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7796 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7797 }
7798 break;
7799
7800 case 'D':
7801 {
7802 /* Print a replicated constant in decimal, treating it as
7803 unsigned. */
7804 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7805 {
7806 output_operand_lossage ("invalid operand for '%%%c'", code);
7807 return;
7808 }
7809 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7810 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7811 }
7812 break;
7813
7814 case 'w':
7815 case 'x':
7816 if (x == const0_rtx
7817 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7818 {
7819 asm_fprintf (f, "%czr", code);
7820 break;
7821 }
7822
7823 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7824 {
7825 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7826 break;
7827 }
7828
7829 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7830 {
7831 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7832 break;
7833 }
7834
7835 /* Fall through */
7836
7837 case 0:
7838 if (x == NULL)
7839 {
7840 output_operand_lossage ("missing operand");
7841 return;
7842 }
7843
7844 switch (GET_CODE (x))
7845 {
7846 case REG:
7847 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7848 {
7849 if (REG_NREGS (x) == 1)
7850 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7851 else
7852 {
7853 char suffix
7854 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7855 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7856 REGNO (x) - V0_REGNUM, suffix,
7857 END_REGNO (x) - V0_REGNUM - 1, suffix);
7858 }
7859 }
7860 else
7861 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7862 break;
7863
7864 case MEM:
7865 output_address (GET_MODE (x), XEXP (x, 0));
7866 break;
7867
7868 case LABEL_REF:
7869 case SYMBOL_REF:
7870 output_addr_const (asm_out_file, x);
7871 break;
7872
7873 case CONST_INT:
7874 asm_fprintf (f, "%wd", INTVAL (x));
7875 break;
7876
7877 case CONST:
7878 if (!VECTOR_MODE_P (GET_MODE (x)))
7879 {
7880 output_addr_const (asm_out_file, x);
7881 break;
7882 }
7883 /* fall through */
7884
7885 case CONST_VECTOR:
7886 if (!const_vec_duplicate_p (x, &elt))
7887 {
7888 output_operand_lossage ("invalid vector constant");
7889 return;
7890 }
7891
7892 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7893 asm_fprintf (f, "%wd", INTVAL (elt));
7894 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7895 && aarch64_print_vector_float_operand (f, x, false))
7896 ;
7897 else
7898 {
7899 output_operand_lossage ("invalid vector constant");
7900 return;
7901 }
7902 break;
7903
7904 case CONST_DOUBLE:
7905 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7906 be getting CONST_DOUBLEs holding integers. */
7907 gcc_assert (GET_MODE (x) != VOIDmode);
7908 if (aarch64_float_const_zero_rtx_p (x))
7909 {
7910 fputc ('0', f);
7911 break;
7912 }
7913 else if (aarch64_float_const_representable_p (x))
7914 {
7915 #define buf_size 20
7916 char float_buf[buf_size] = {'\0'};
7917 real_to_decimal_for_mode (float_buf,
7918 CONST_DOUBLE_REAL_VALUE (x),
7919 buf_size, buf_size,
7920 1, GET_MODE (x));
7921 asm_fprintf (asm_out_file, "%s", float_buf);
7922 break;
7923 #undef buf_size
7924 }
7925 output_operand_lossage ("invalid constant");
7926 return;
7927 default:
7928 output_operand_lossage ("invalid operand");
7929 return;
7930 }
7931 break;
7932
7933 case 'A':
7934 if (GET_CODE (x) == HIGH)
7935 x = XEXP (x, 0);
7936
7937 switch (aarch64_classify_symbolic_expression (x))
7938 {
7939 case SYMBOL_SMALL_GOT_4G:
7940 asm_fprintf (asm_out_file, ":got:");
7941 break;
7942
7943 case SYMBOL_SMALL_TLSGD:
7944 asm_fprintf (asm_out_file, ":tlsgd:");
7945 break;
7946
7947 case SYMBOL_SMALL_TLSDESC:
7948 asm_fprintf (asm_out_file, ":tlsdesc:");
7949 break;
7950
7951 case SYMBOL_SMALL_TLSIE:
7952 asm_fprintf (asm_out_file, ":gottprel:");
7953 break;
7954
7955 case SYMBOL_TLSLE24:
7956 asm_fprintf (asm_out_file, ":tprel:");
7957 break;
7958
7959 case SYMBOL_TINY_GOT:
7960 gcc_unreachable ();
7961 break;
7962
7963 default:
7964 break;
7965 }
7966 output_addr_const (asm_out_file, x);
7967 break;
7968
7969 case 'L':
7970 switch (aarch64_classify_symbolic_expression (x))
7971 {
7972 case SYMBOL_SMALL_GOT_4G:
7973 asm_fprintf (asm_out_file, ":lo12:");
7974 break;
7975
7976 case SYMBOL_SMALL_TLSGD:
7977 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7978 break;
7979
7980 case SYMBOL_SMALL_TLSDESC:
7981 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7982 break;
7983
7984 case SYMBOL_SMALL_TLSIE:
7985 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7986 break;
7987
7988 case SYMBOL_TLSLE12:
7989 asm_fprintf (asm_out_file, ":tprel_lo12:");
7990 break;
7991
7992 case SYMBOL_TLSLE24:
7993 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7994 break;
7995
7996 case SYMBOL_TINY_GOT:
7997 asm_fprintf (asm_out_file, ":got:");
7998 break;
7999
8000 case SYMBOL_TINY_TLSIE:
8001 asm_fprintf (asm_out_file, ":gottprel:");
8002 break;
8003
8004 default:
8005 break;
8006 }
8007 output_addr_const (asm_out_file, x);
8008 break;
8009
8010 case 'G':
8011 switch (aarch64_classify_symbolic_expression (x))
8012 {
8013 case SYMBOL_TLSLE24:
8014 asm_fprintf (asm_out_file, ":tprel_hi12:");
8015 break;
8016 default:
8017 break;
8018 }
8019 output_addr_const (asm_out_file, x);
8020 break;
8021
8022 case 'k':
8023 {
8024 HOST_WIDE_INT cond_code;
8025
8026 if (!CONST_INT_P (x))
8027 {
8028 output_operand_lossage ("invalid operand for '%%%c'", code);
8029 return;
8030 }
8031
8032 cond_code = INTVAL (x);
8033 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8034 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8035 }
8036 break;
8037
8038 case 'y':
8039 case 'z':
8040 {
8041 machine_mode mode = GET_MODE (x);
8042
8043 if (GET_CODE (x) != MEM
8044 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8045 {
8046 output_operand_lossage ("invalid operand for '%%%c'", code);
8047 return;
8048 }
8049
8050 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8051 code == 'y'
8052 ? ADDR_QUERY_LDP_STP_N
8053 : ADDR_QUERY_LDP_STP))
8054 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8055 }
8056 break;
8057
8058 default:
8059 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8060 return;
8061 }
8062 }
8063
8064 /* Print address 'x' of a memory access with mode 'mode'.
8065 'type' is the aarch64_addr_query_type context required by
8066 aarch64_classify_address (e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand). */
8067 static bool
8068 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8069 aarch64_addr_query_type type)
8070 {
8071 struct aarch64_address_info addr;
8072 unsigned int size;
8073
8074 /* Check all addresses are Pmode - including ILP32. */
8075 if (GET_MODE (x) != Pmode
8076 && (!CONST_INT_P (x)
8077 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8078 {
8079 output_operand_lossage ("invalid address mode");
8080 return false;
8081 }
8082
8083 if (aarch64_classify_address (&addr, x, mode, true, type))
8084 switch (addr.type)
8085 {
8086 case ADDRESS_REG_IMM:
8087 if (known_eq (addr.const_offset, 0))
8088 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8089 else if (aarch64_sve_data_mode_p (mode))
8090 {
8091 HOST_WIDE_INT vnum
8092 = exact_div (addr.const_offset,
8093 BYTES_PER_SVE_VECTOR).to_constant ();
8094 asm_fprintf (f, "[%s, #%wd, mul vl]",
8095 reg_names[REGNO (addr.base)], vnum);
8096 }
8097 else if (aarch64_sve_pred_mode_p (mode))
8098 {
8099 HOST_WIDE_INT vnum
8100 = exact_div (addr.const_offset,
8101 BYTES_PER_SVE_PRED).to_constant ();
8102 asm_fprintf (f, "[%s, #%wd, mul vl]",
8103 reg_names[REGNO (addr.base)], vnum);
8104 }
8105 else
8106 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8107 INTVAL (addr.offset));
8108 return true;
8109
8110 case ADDRESS_REG_REG:
8111 if (addr.shift == 0)
8112 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8113 reg_names [REGNO (addr.offset)]);
8114 else
8115 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8116 reg_names [REGNO (addr.offset)], addr.shift);
8117 return true;
8118
8119 case ADDRESS_REG_UXTW:
8120 if (addr.shift == 0)
8121 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8122 REGNO (addr.offset) - R0_REGNUM);
8123 else
8124 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8125 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8126 return true;
8127
8128 case ADDRESS_REG_SXTW:
8129 if (addr.shift == 0)
8130 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8131 REGNO (addr.offset) - R0_REGNUM);
8132 else
8133 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8134 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8135 return true;
8136
8137 case ADDRESS_REG_WB:
8138 /* Writeback is only supported for fixed-width modes. */
8139 size = GET_MODE_SIZE (mode).to_constant ();
8140 switch (GET_CODE (x))
8141 {
8142 case PRE_INC:
8143 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8144 return true;
8145 case POST_INC:
8146 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8147 return true;
8148 case PRE_DEC:
8149 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8150 return true;
8151 case POST_DEC:
8152 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8153 return true;
8154 case PRE_MODIFY:
8155 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8156 INTVAL (addr.offset));
8157 return true;
8158 case POST_MODIFY:
8159 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8160 INTVAL (addr.offset));
8161 return true;
8162 default:
8163 break;
8164 }
8165 break;
8166
8167 case ADDRESS_LO_SUM:
8168 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8169 output_addr_const (f, addr.offset);
8170 asm_fprintf (f, "]");
8171 return true;
8172
8173 case ADDRESS_SYMBOLIC:
8174 output_addr_const (f, x);
8175 return true;
8176 }
8177
8178 return false;
8179 }
8180
8181 /* Print address 'x' of a memory access with mode 'mode'. */
8182 static void
8183 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8184 {
8185 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8186 output_addr_const (f, x);
8187 }
8188
8189 bool
8190 aarch64_label_mentioned_p (rtx x)
8191 {
8192 const char *fmt;
8193 int i;
8194
8195 if (GET_CODE (x) == LABEL_REF)
8196 return true;
8197
8198 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8199 referencing instruction, but they are constant offsets, not
8200 symbols. */
8201 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8202 return false;
8203
8204 fmt = GET_RTX_FORMAT (GET_CODE (x));
8205 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8206 {
8207 if (fmt[i] == 'E')
8208 {
8209 int j;
8210
8211 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8212 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8213 return 1;
8214 }
8215 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8216 return 1;
8217 }
8218
8219 return 0;
8220 }
8221
8222 /* Implement REGNO_REG_CLASS. */
8223
8224 enum reg_class
8225 aarch64_regno_regclass (unsigned regno)
8226 {
8227 if (GP_REGNUM_P (regno))
8228 return GENERAL_REGS;
8229
8230 if (regno == SP_REGNUM)
8231 return STACK_REG;
8232
8233 if (regno == FRAME_POINTER_REGNUM
8234 || regno == ARG_POINTER_REGNUM)
8235 return POINTER_REGS;
8236
8237 if (FP_REGNUM_P (regno))
8238 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8239
8240 if (PR_REGNUM_P (regno))
8241 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8242
8243 return NO_REGS;
8244 }
8245
8246 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8247 If OFFSET is out of range, return an offset of an anchor point
8248 that is in range. Return 0 otherwise. */
8249
8250 static HOST_WIDE_INT
8251 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8252 machine_mode mode)
8253 {
8254 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8255 if (size > 16)
8256 return (offset + 0x400) & ~0x7f0;
8257
8258 /* For offsets that aren't a multiple of the access size, the limit is
8259 -256...255. */
8260 if (offset & (size - 1))
8261 {
8262 /* BLKmode typically uses LDP of X-registers. */
8263 if (mode == BLKmode)
8264 return (offset + 512) & ~0x3ff;
8265 return (offset + 0x100) & ~0x1ff;
8266 }
8267
8268 /* Small negative offsets are supported. */
8269 if (IN_RANGE (offset, -256, 0))
8270 return 0;
8271
8272 if (mode == TImode || mode == TFmode)
8273 return (offset + 0x100) & ~0x1ff;
8274
8275 /* Use an unsigned 12-bit offset scaled by the access size. */
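/* For example, a 4-byte access at offset 0x12344 is anchored at 0x10000,
leaving a residual offset of 0x2344, which fits the scaled unsigned
12-bit immediate (maximum 0xfff * 4 == 0x3ffc). */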
8276 return offset & (~0xfff * size);
8277 }
8278
8279 static rtx
8280 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8281 {
8282 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8283 where mask is selected by alignment and size of the offset.
8284 We try to pick as large a range for the offset as possible to
8285 maximize the chance of a CSE. However, for aligned addresses
8286 we limit the range to 4k so that structures with different sized
8287 elements are likely to use the same base. We need to be careful
8288 not to split a CONST for some forms of address expression, otherwise
8289 it will generate sub-optimal code. */
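/* For example, with a 4-byte access the address X + 0x13204 is rewritten
as (X + 0x10000) + 0x3204, so that neighbouring accesses can share the
X + 0x10000 anchor. */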
8290
8291 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8292 {
8293 rtx base = XEXP (x, 0);
8294 rtx offset_rtx = XEXP (x, 1);
8295 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8296
8297 if (GET_CODE (base) == PLUS)
8298 {
8299 rtx op0 = XEXP (base, 0);
8300 rtx op1 = XEXP (base, 1);
8301
8302 /* Force any scaling into a temp for CSE. */
8303 op0 = force_reg (Pmode, op0);
8304 op1 = force_reg (Pmode, op1);
8305
8306 /* Let the pointer register be in op0. */
8307 if (REG_POINTER (op1))
8308 std::swap (op0, op1);
8309
8310 /* If the pointer is virtual or frame related, then we know that
8311 virtual register instantiation or register elimination is going
8312 to apply a second constant. We want the two constants folded
8313 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8314 if (virt_or_elim_regno_p (REGNO (op0)))
8315 {
8316 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8317 NULL_RTX, true, OPTAB_DIRECT);
8318 return gen_rtx_PLUS (Pmode, base, op1);
8319 }
8320
8321 /* Otherwise, in order to encourage CSE (and thence loop strength
8322 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8323 base = expand_binop (Pmode, add_optab, op0, op1,
8324 NULL_RTX, true, OPTAB_DIRECT);
8325 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8326 }
8327
8328 HOST_WIDE_INT size;
8329 if (GET_MODE_SIZE (mode).is_constant (&size))
8330 {
8331 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8332 mode);
8333 if (base_offset != 0)
8334 {
8335 base = plus_constant (Pmode, base, base_offset);
8336 base = force_operand (base, NULL_RTX);
8337 return plus_constant (Pmode, base, offset - base_offset);
8338 }
8339 }
8340 }
8341
8342 return x;
8343 }
8344
8345 static reg_class_t
8346 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8347 reg_class_t rclass,
8348 machine_mode mode,
8349 secondary_reload_info *sri)
8350 {
8351 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8352 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8353 comment at the head of aarch64-sve.md for more details about the
8354 big-endian handling. */
8355 if (BYTES_BIG_ENDIAN
8356 && reg_class_subset_p (rclass, FP_REGS)
8357 && !((REG_P (x) && HARD_REGISTER_P (x))
8358 || aarch64_simd_valid_immediate (x, NULL))
8359 && aarch64_sve_data_mode_p (mode))
8360 {
8361 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8362 return NO_REGS;
8363 }
8364
8365 /* If we have to disable direct literal pool loads and stores because the
8366 function is too big, then we need a scratch register. */
8367 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8368 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8369 || targetm.vector_mode_supported_p (GET_MODE (x)))
8370 && !aarch64_pcrelative_literal_loads)
8371 {
8372 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8373 return NO_REGS;
8374 }
8375
8376 /* Without the TARGET_SIMD instructions we cannot move a Q register
8377 to a Q register directly. We need a scratch. */
8378 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8379 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8380 && reg_class_subset_p (rclass, FP_REGS))
8381 {
8382 sri->icode = code_for_aarch64_reload_mov (mode);
8383 return NO_REGS;
8384 }
8385
8386 /* A TFmode or TImode memory access should be handled via an FP register,
8387 because AArch64 has richer addressing modes for LDR/STR instructions
8388 than for LDP/STP instructions. */
8389 if (TARGET_FLOAT && rclass == GENERAL_REGS
8390 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8391 return FP_REGS;
8392
8393 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8394 return GENERAL_REGS;
8395
8396 return NO_REGS;
8397 }
8398
8399 static bool
8400 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8401 {
8402 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8403
8404 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8405 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8406 if (frame_pointer_needed)
8407 return to == HARD_FRAME_POINTER_REGNUM;
8408 return true;
8409 }
8410
8411 poly_int64
8412 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8413 {
8414 if (to == HARD_FRAME_POINTER_REGNUM)
8415 {
8416 if (from == ARG_POINTER_REGNUM)
8417 return cfun->machine->frame.hard_fp_offset;
8418
8419 if (from == FRAME_POINTER_REGNUM)
8420 return cfun->machine->frame.hard_fp_offset
8421 - cfun->machine->frame.locals_offset;
8422 }
8423
8424 if (to == STACK_POINTER_REGNUM)
8425 {
8426 if (from == FRAME_POINTER_REGNUM)
8427 return cfun->machine->frame.frame_size
8428 - cfun->machine->frame.locals_offset;
8429 }
8430
8431 return cfun->machine->frame.frame_size;
8432 }
8433
8434 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8435 previous frame. */
8436
8437 rtx
8438 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8439 {
8440 if (count != 0)
8441 return const0_rtx;
8442 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8443 }
8444
8445
8446 static void
8447 aarch64_asm_trampoline_template (FILE *f)
8448 {
8449 int offset1 = 16;
8450 int offset2 = 20;
8451
8452 if (aarch64_bti_enabled ())
8453 {
8454 asm_fprintf (f, "\thint\t34 // bti c\n");
8455 offset1 -= 4;
8456 offset2 -= 4;
8457 }
8458
8459 if (TARGET_ILP32)
8460 {
8461 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8462 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8463 offset1);
8464 }
8465 else
8466 {
8467 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8468 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8469 offset2);
8470 }
8471 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8472
8473 /* The trampoline needs an extra padding instruction. If BTI is enabled,
8474 the padding instruction is replaced by the BTI instruction at
8475 the beginning. */
8476 if (!aarch64_bti_enabled ())
8477 assemble_aligned_integer (4, const0_rtx);
8478
8479 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8480 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8481 }
8482
8483 static void
8484 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8485 {
8486 rtx fnaddr, mem, a_tramp;
8487 const int tramp_code_sz = 16;
8488
8489 /* Don't need to copy the trailing D-words; we fill those in below. */
8490 emit_block_move (m_tramp, assemble_trampoline_template (),
8491 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8492 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8493 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8494 if (GET_MODE (fnaddr) != ptr_mode)
8495 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8496 emit_move_insn (mem, fnaddr);
8497
8498 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8499 emit_move_insn (mem, chain_value);
8500
8501 /* XXX We should really define a "clear_cache" pattern and use
8502 gen_clear_cache(). */
8503 a_tramp = XEXP (m_tramp, 0);
8504 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8505 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8506 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8507 ptr_mode);
8508 }
8509
8510 static unsigned char
8511 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8512 {
8513 /* ??? Logically we should only need to provide a value when
8514 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8515 can hold MODE, but at the moment we need to handle all modes.
8516 Just ignore any runtime parts for registers that can't store them. */
8517 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8518 unsigned int nregs;
8519 switch (regclass)
8520 {
8521 case TAILCALL_ADDR_REGS:
8522 case POINTER_REGS:
8523 case GENERAL_REGS:
8524 case ALL_REGS:
8525 case POINTER_AND_FP_REGS:
8526 case FP_REGS:
8527 case FP_LO_REGS:
8528 if (aarch64_sve_data_mode_p (mode)
8529 && constant_multiple_p (GET_MODE_SIZE (mode),
8530 BYTES_PER_SVE_VECTOR, &nregs))
8531 return nregs;
8532 return (aarch64_vector_data_mode_p (mode)
8533 ? CEIL (lowest_size, UNITS_PER_VREG)
8534 : CEIL (lowest_size, UNITS_PER_WORD));
8535 case STACK_REG:
8536 case PR_REGS:
8537 case PR_LO_REGS:
8538 case PR_HI_REGS:
8539 return 1;
8540
8541 case NO_REGS:
8542 return 0;
8543
8544 default:
8545 break;
8546 }
8547 gcc_unreachable ();
8548 }
8549
8550 static reg_class_t
8551 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8552 {
8553 if (regclass == POINTER_REGS)
8554 return GENERAL_REGS;
8555
8556 if (regclass == STACK_REG)
8557 {
8558 if (REG_P(x)
8559 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8560 return regclass;
8561
8562 return NO_REGS;
8563 }
8564
8565 /* Register elimination can result in a request for
8566 SP+constant->FP_REGS. We cannot support such operations, which
8567 use SP as the source and an FP_REG as the destination, so reject
8568 them outright. */
8569 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8570 {
8571 rtx lhs = XEXP (x, 0);
8572
8573 /* Look through a possible SUBREG introduced by ILP32. */
8574 if (GET_CODE (lhs) == SUBREG)
8575 lhs = SUBREG_REG (lhs);
8576
8577 gcc_assert (REG_P (lhs));
8578 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8579 POINTER_REGS));
8580 return NO_REGS;
8581 }
8582
8583 return regclass;
8584 }
8585
8586 void
8587 aarch64_asm_output_labelref (FILE* f, const char *name)
8588 {
8589 asm_fprintf (f, "%U%s", name);
8590 }
8591
8592 static void
8593 aarch64_elf_asm_constructor (rtx symbol, int priority)
8594 {
8595 if (priority == DEFAULT_INIT_PRIORITY)
8596 default_ctor_section_asm_out_constructor (symbol, priority);
8597 else
8598 {
8599 section *s;
8600 /* Priority is known to be in the range [0, 65535], so 18 bytes would
8601 be enough, but the compiler might not know that. To avoid a
8602 -Wformat-truncation false positive, use a larger size. */
8603 char buf[23];
8604 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8605 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8606 switch_to_section (s);
8607 assemble_align (POINTER_SIZE);
8608 assemble_aligned_integer (POINTER_BYTES, symbol);
8609 }
8610 }
8611
8612 static void
8613 aarch64_elf_asm_destructor (rtx symbol, int priority)
8614 {
8615 if (priority == DEFAULT_INIT_PRIORITY)
8616 default_dtor_section_asm_out_destructor (symbol, priority);
8617 else
8618 {
8619 section *s;
8620 /* Priority is known to be in the range [0, 65535], so 18 bytes would
8621 be enough, but the compiler might not know that. To avoid a
8622 -Wformat-truncation false positive, use a larger size. */
8623 char buf[23];
8624 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8625 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8626 switch_to_section (s);
8627 assemble_align (POINTER_SIZE);
8628 assemble_aligned_integer (POINTER_BYTES, symbol);
8629 }
8630 }
8631
8632 const char*
8633 aarch64_output_casesi (rtx *operands)
8634 {
8635 char buf[100];
8636 char label[100];
8637 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8638 int index;
8639 static const char *const patterns[4][2] =
8640 {
8641 {
8642 "ldrb\t%w3, [%0,%w1,uxtw]",
8643 "add\t%3, %4, %w3, sxtb #2"
8644 },
8645 {
8646 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8647 "add\t%3, %4, %w3, sxth #2"
8648 },
8649 {
8650 "ldr\t%w3, [%0,%w1,uxtw #2]",
8651 "add\t%3, %4, %w3, sxtw #2"
8652 },
8653 /* We assume that DImode is only generated when not optimizing and
8654 that we don't really need 64-bit address offsets. That would
8655 imply an object file with 8GB of code in a single function! */
8656 {
8657 "ldr\t%w3, [%0,%w1,uxtw #2]",
8658 "add\t%3, %4, %w3, sxtw #2"
8659 }
8660 };
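/* The code emitted below loads the dispatch-table entry, uses ADR to
take the address of the local label assembled just after the BR,
adds the sign-extended, scaled entry to that address, and then
branches to the result. */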
8661
8662 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8663
8664 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8665 index = exact_log2 (GET_MODE_SIZE (mode));
8666
8667 gcc_assert (index >= 0 && index <= 3);
8668
8669 /* Need to implement table size reduction, by changing the code below. */
8670 output_asm_insn (patterns[index][0], operands);
8671 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8672 snprintf (buf, sizeof (buf),
8673 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8674 output_asm_insn (buf, operands);
8675 output_asm_insn (patterns[index][1], operands);
8676 output_asm_insn ("br\t%3", operands);
8677 assemble_label (asm_out_file, label);
8678 return "";
8679 }
8680
8681
8682 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8683 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8684 operator. */
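/* For example, SHIFT == 1 with MASK == 0x1fe describes a byte value
scaled by 2, so we return 8. */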
8685
8686 int
8687 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8688 {
8689 if (shift >= 0 && shift <= 3)
8690 {
8691 int size;
8692 for (size = 8; size <= 32; size *= 2)
8693 {
8694 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8695 if (mask == bits << shift)
8696 return size;
8697 }
8698 }
8699 return 0;
8700 }
8701
8702 /* Constant pools are per-function only when PC-relative literal
8703 loads are enabled or we are using the large memory
8704 model. */
8705
8706 static inline bool
8707 aarch64_can_use_per_function_literal_pools_p (void)
8708 {
8709 return (aarch64_pcrelative_literal_loads
8710 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8711 }
8712
8713 static bool
8714 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8715 {
8716 /* We can't use blocks for constants when we're using a per-function
8717 constant pool. */
8718 return !aarch64_can_use_per_function_literal_pools_p ();
8719 }
8720
8721 /* Select appropriate section for constants depending
8722 on where we place literal pools. */
8723
8724 static section *
8725 aarch64_select_rtx_section (machine_mode mode,
8726 rtx x,
8727 unsigned HOST_WIDE_INT align)
8728 {
8729 if (aarch64_can_use_per_function_literal_pools_p ())
8730 return function_section (current_function_decl);
8731
8732 return default_elf_select_rtx_section (mode, x, align);
8733 }
8734
8735 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8736 void
8737 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8738 HOST_WIDE_INT offset)
8739 {
8740 /* When using per-function literal pools, we must ensure that any code
8741 section is aligned to the minimal instruction length, lest we get
8742 errors from the assembler about "unaligned instructions". */
8743 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8744 ASM_OUTPUT_ALIGN (f, 2);
8745 }
8746
8747 /* Costs. */
8748
8749 /* Helper function for rtx cost calculation. Strip a shift expression
8750 from X. Returns the inner operand if successful, or the original
8751 expression on failure. */
8752 static rtx
8753 aarch64_strip_shift (rtx x)
8754 {
8755 rtx op = x;
8756
8757 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8758 we can convert both to ROR during final output. */
8759 if ((GET_CODE (op) == ASHIFT
8760 || GET_CODE (op) == ASHIFTRT
8761 || GET_CODE (op) == LSHIFTRT
8762 || GET_CODE (op) == ROTATERT
8763 || GET_CODE (op) == ROTATE)
8764 && CONST_INT_P (XEXP (op, 1)))
8765 return XEXP (op, 0);
8766
8767 if (GET_CODE (op) == MULT
8768 && CONST_INT_P (XEXP (op, 1))
8769 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8770 return XEXP (op, 0);
8771
8772 return x;
8773 }
8774
8775 /* Helper function for rtx cost calculation. Strip an extend
8776 expression from X. Returns the inner operand if successful, or the
8777 original expression on failure. We deal with a number of possible
8778 canonicalization variations here. If STRIP_SHIFT is true, then
8779 we can strip off a shift also. */
8780 static rtx
8781 aarch64_strip_extend (rtx x, bool strip_shift)
8782 {
8783 scalar_int_mode mode;
8784 rtx op = x;
8785
8786 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8787 return op;
8788
8789 /* Zero and sign extraction of a widened value. */
8790 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8791 && XEXP (op, 2) == const0_rtx
8792 && GET_CODE (XEXP (op, 0)) == MULT
8793 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8794 XEXP (op, 1)))
8795 return XEXP (XEXP (op, 0), 0);
8796
8797 /* It can also be represented (for zero-extend) as an AND with an
8798 immediate. */
8799 if (GET_CODE (op) == AND
8800 && GET_CODE (XEXP (op, 0)) == MULT
8801 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8802 && CONST_INT_P (XEXP (op, 1))
8803 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8804 INTVAL (XEXP (op, 1))) != 0)
8805 return XEXP (XEXP (op, 0), 0);
8806
8807 /* Now handle extended register, as this may also have an optional
8808 left shift by 1..4. */
8809 if (strip_shift
8810 && GET_CODE (op) == ASHIFT
8811 && CONST_INT_P (XEXP (op, 1))
8812 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8813 op = XEXP (op, 0);
8814
8815 if (GET_CODE (op) == ZERO_EXTEND
8816 || GET_CODE (op) == SIGN_EXTEND)
8817 op = XEXP (op, 0);
8818
8819 if (op != x)
8820 return op;
8821
8822 return x;
8823 }
8824
8825 /* Return true iff CODE is a shift supported in combination
8826 with arithmetic instructions. */
8827
8828 static bool
8829 aarch64_shift_p (enum rtx_code code)
8830 {
8831 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8832 }
8833
8834
8835 /* Return true iff X is a cheap shift without a sign extend. */
8836
8837 static bool
8838 aarch64_cheap_mult_shift_p (rtx x)
8839 {
8840 rtx op0, op1;
8841
8842 op0 = XEXP (x, 0);
8843 op1 = XEXP (x, 1);
8844
8845 if (!(aarch64_tune_params.extra_tuning_flags
8846 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8847 return false;
8848
8849 if (GET_CODE (op0) == SIGN_EXTEND)
8850 return false;
8851
8852 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8853 && UINTVAL (op1) <= 4)
8854 return true;
8855
8856 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8857 return false;
8858
8859 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8860
8861 if (l2 > 0 && l2 <= 4)
8862 return true;
8863
8864 return false;
8865 }
8866
8867 /* Helper function for rtx cost calculation. Calculate the cost of
8868 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8869 Return the calculated cost of the expression, recursing manually into
8870 operands where needed. */
8871
8872 static int
8873 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8874 {
8875 rtx op0, op1;
8876 const struct cpu_cost_table *extra_cost
8877 = aarch64_tune_params.insn_extra_cost;
8878 int cost = 0;
8879 bool compound_p = (outer == PLUS || outer == MINUS);
8880 machine_mode mode = GET_MODE (x);
8881
8882 gcc_checking_assert (code == MULT);
8883
8884 op0 = XEXP (x, 0);
8885 op1 = XEXP (x, 1);
8886
8887 if (VECTOR_MODE_P (mode))
8888 mode = GET_MODE_INNER (mode);
8889
8890 /* Integer multiply/fma. */
8891 if (GET_MODE_CLASS (mode) == MODE_INT)
8892 {
8893 /* The multiply will be canonicalized as a shift, cost it as such. */
8894 if (aarch64_shift_p (GET_CODE (x))
8895 || (CONST_INT_P (op1)
8896 && exact_log2 (INTVAL (op1)) > 0))
8897 {
8898 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8899 || GET_CODE (op0) == SIGN_EXTEND;
8900 if (speed)
8901 {
8902 if (compound_p)
8903 {
8904 /* If the shift is considered cheap,
8905 then don't add any cost. */
8906 if (aarch64_cheap_mult_shift_p (x))
8907 ;
8908 else if (REG_P (op1))
8909 /* ARITH + shift-by-register. */
8910 cost += extra_cost->alu.arith_shift_reg;
8911 else if (is_extend)
8912 /* ARITH + extended register. We don't have a cost field
8913 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8914 cost += extra_cost->alu.extend_arith;
8915 else
8916 /* ARITH + shift-by-immediate. */
8917 cost += extra_cost->alu.arith_shift;
8918 }
8919 else
8920 /* LSL (immediate). */
8921 cost += extra_cost->alu.shift;
8922
8923 }
8924 /* Strip extends as we will have costed them in the case above. */
8925 if (is_extend)
8926 op0 = aarch64_strip_extend (op0, true);
8927
8928 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8929
8930 return cost;
8931 }
8932
8933 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8934 compound operation and let the cases below handle it. After all, MNEG is a
8935 special-case alias of MSUB. */
8936 if (GET_CODE (op0) == NEG)
8937 {
8938 op0 = XEXP (op0, 0);
8939 compound_p = true;
8940 }
8941
8942 /* Integer multiplies or FMAs have zero/sign extending variants. */
8943 if ((GET_CODE (op0) == ZERO_EXTEND
8944 && GET_CODE (op1) == ZERO_EXTEND)
8945 || (GET_CODE (op0) == SIGN_EXTEND
8946 && GET_CODE (op1) == SIGN_EXTEND))
8947 {
8948 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8949 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8950
8951 if (speed)
8952 {
8953 if (compound_p)
8954 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8955 cost += extra_cost->mult[0].extend_add;
8956 else
8957 /* MUL/SMULL/UMULL. */
8958 cost += extra_cost->mult[0].extend;
8959 }
8960
8961 return cost;
8962 }
8963
8964 /* This is either an integer multiply or a MADD. In both cases
8965 we want to recurse and cost the operands. */
8966 cost += rtx_cost (op0, mode, MULT, 0, speed);
8967 cost += rtx_cost (op1, mode, MULT, 1, speed);
8968
8969 if (speed)
8970 {
8971 if (compound_p)
8972 /* MADD/MSUB. */
8973 cost += extra_cost->mult[mode == DImode].add;
8974 else
8975 /* MUL. */
8976 cost += extra_cost->mult[mode == DImode].simple;
8977 }
8978
8979 return cost;
8980 }
8981 else
8982 {
8983 if (speed)
8984 {
8985 /* Floating-point FMA/FMUL can also support negations of the
8986 operands, unless the rounding mode is upward or downward in
8987 which case FNMUL differs from FMUL with operand negation. */
8988 bool neg0 = GET_CODE (op0) == NEG;
8989 bool neg1 = GET_CODE (op1) == NEG;
8990 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8991 {
8992 if (neg0)
8993 op0 = XEXP (op0, 0);
8994 if (neg1)
8995 op1 = XEXP (op1, 0);
8996 }
8997
8998 if (compound_p)
8999 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9000 cost += extra_cost->fp[mode == DFmode].fma;
9001 else
9002 /* FMUL/FNMUL. */
9003 cost += extra_cost->fp[mode == DFmode].mult;
9004 }
9005
9006 cost += rtx_cost (op0, mode, MULT, 0, speed);
9007 cost += rtx_cost (op1, mode, MULT, 1, speed);
9008 return cost;
9009 }
9010 }
9011
9012 static int
9013 aarch64_address_cost (rtx x,
9014 machine_mode mode,
9015 addr_space_t as ATTRIBUTE_UNUSED,
9016 bool speed)
9017 {
9018 enum rtx_code c = GET_CODE (x);
9019 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9020 struct aarch64_address_info info;
9021 int cost = 0;
9022 info.shift = 0;
9023
9024 if (!aarch64_classify_address (&info, x, mode, false))
9025 {
9026 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9027 {
9028 /* This is a CONST or SYMBOL ref which will be split
9029 in a different way depending on the code model in use.
9030 Cost it through the generic infrastructure. */
9031 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9032 /* Divide through by the cost of one instruction to
9033 bring it to the same units as the address costs. */
9034 cost_symbol_ref /= COSTS_N_INSNS (1);
9035 /* The cost is then the cost of preparing the address,
9036 followed by an immediate (possibly 0) offset. */
9037 return cost_symbol_ref + addr_cost->imm_offset;
9038 }
9039 else
9040 {
9041 /* This is most likely a jump table from a case
9042 statement. */
9043 return addr_cost->register_offset;
9044 }
9045 }
9046
9047 switch (info.type)
9048 {
9049 case ADDRESS_LO_SUM:
9050 case ADDRESS_SYMBOLIC:
9051 case ADDRESS_REG_IMM:
9052 cost += addr_cost->imm_offset;
9053 break;
9054
9055 case ADDRESS_REG_WB:
9056 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9057 cost += addr_cost->pre_modify;
9058 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9059 cost += addr_cost->post_modify;
9060 else
9061 gcc_unreachable ();
9062
9063 break;
9064
9065 case ADDRESS_REG_REG:
9066 cost += addr_cost->register_offset;
9067 break;
9068
9069 case ADDRESS_REG_SXTW:
9070 cost += addr_cost->register_sextend;
9071 break;
9072
9073 case ADDRESS_REG_UXTW:
9074 cost += addr_cost->register_zextend;
9075 break;
9076
9077 default:
9078 gcc_unreachable ();
9079 }
9080
9081
9082 if (info.shift > 0)
9083 {
9084 /* For the sake of calculating the cost of the shifted register
9085 component, we can treat same sized modes in the same way. */
9086 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9087 cost += addr_cost->addr_scale_costs.hi;
9088 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9089 cost += addr_cost->addr_scale_costs.si;
9090 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9091 cost += addr_cost->addr_scale_costs.di;
9092 else
9093 /* We can't tell, or this is a 128-bit vector. */
9094 cost += addr_cost->addr_scale_costs.ti;
9095 }
9096
9097 return cost;
9098 }
9099
9100 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9101 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9102 to be taken. */
9103
9104 int
9105 aarch64_branch_cost (bool speed_p, bool predictable_p)
9106 {
9107 /* When optimizing for speed, use the cost of unpredictable branches. */
9108 const struct cpu_branch_cost *branch_costs =
9109 aarch64_tune_params.branch_costs;
9110
9111 if (!speed_p || predictable_p)
9112 return branch_costs->predictable;
9113 else
9114 return branch_costs->unpredictable;
9115 }
9116
9117 /* Return true if the RTX X in mode MODE is a zero or sign extract
9118 usable in an ADD or SUB (extended register) instruction. */
9119 static bool
9120 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9121 {
9122 /* Catch add with a sign extract.
9123 This is add_<optab><mode>_multp2. */
9124 if (GET_CODE (x) == SIGN_EXTRACT
9125 || GET_CODE (x) == ZERO_EXTRACT)
9126 {
9127 rtx op0 = XEXP (x, 0);
9128 rtx op1 = XEXP (x, 1);
9129 rtx op2 = XEXP (x, 2);
9130
9131 if (GET_CODE (op0) == MULT
9132 && CONST_INT_P (op1)
9133 && op2 == const0_rtx
9134 && CONST_INT_P (XEXP (op0, 1))
9135 && aarch64_is_extend_from_extract (mode,
9136 XEXP (op0, 1),
9137 op1))
9138 {
9139 return true;
9140 }
9141 }
9142 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9143 No shift. */
9144 else if (GET_CODE (x) == SIGN_EXTEND
9145 || GET_CODE (x) == ZERO_EXTEND)
9146 return REG_P (XEXP (x, 0));
9147
9148 return false;
9149 }
9150
9151 static bool
9152 aarch64_frint_unspec_p (unsigned int u)
9153 {
9154 switch (u)
9155 {
9156 case UNSPEC_FRINTZ:
9157 case UNSPEC_FRINTP:
9158 case UNSPEC_FRINTM:
9159 case UNSPEC_FRINTA:
9160 case UNSPEC_FRINTN:
9161 case UNSPEC_FRINTX:
9162 case UNSPEC_FRINTI:
9163 return true;
9164
9165 default:
9166 return false;
9167 }
9168 }
9169
9170 /* Return true iff X is an rtx that will match an extr instruction
9171 i.e. as described in the *extr<mode>5_insn family of patterns.
9172 OP0 and OP1 will be set to the operands of the shifts involved
9173 on success and will be NULL_RTX otherwise. */
9174
9175 static bool
9176 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9177 {
9178 rtx op0, op1;
9179 scalar_int_mode mode;
9180 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9181 return false;
9182
9183 *res_op0 = NULL_RTX;
9184 *res_op1 = NULL_RTX;
9185
9186 if (GET_CODE (x) != IOR)
9187 return false;
9188
9189 op0 = XEXP (x, 0);
9190 op1 = XEXP (x, 1);
9191
9192 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9193 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9194 {
9195 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9196 if (GET_CODE (op1) == ASHIFT)
9197 std::swap (op0, op1);
9198
9199 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9200 return false;
9201
9202 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9203 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9204
9205 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9206 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9207 {
9208 *res_op0 = XEXP (op0, 0);
9209 *res_op1 = XEXP (op1, 0);
9210 return true;
9211 }
9212 }
9213
9214 return false;
9215 }
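
/* Editor's illustrative aside -- not part of the GCC sources.  The
   IOR-of-opposite-shifts shape accepted above is a "funnel shift", which a
   single EXTR instruction implements: it returns the register-width bits
   of the concatenation HI:LO starting at bit LSB.  A minimal standalone
   64-bit sketch of that identity (extr_sketch is a hypothetical name; LSB
   must lie in 1..63 here to avoid undefined shifts):  */

#include <assert.h>
#include <stdint.h>

static uint64_t
extr_sketch (uint64_t hi, uint64_t lo, unsigned lsb)
{
  /* Matches (ior (ashift hi (64 - lsb)) (lshiftrt lo lsb)); the two shift
     amounts sum to the mode bitsize, exactly as required above.  */
  return (hi << (64 - lsb)) | (lo >> lsb);
}

int
main (void)
{
  uint64_t hi = 0x1122334455667788ull;
  uint64_t lo = 0x99aabbccddeeff00ull;

  /* Bits 8..71 of the 128-bit value HI:LO.  */
  assert (extr_sketch (hi, lo, 8) == 0x8899aabbccddeeffull);
  return 0;
}
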
9216
9217 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9218 storing it in *COST. Result is true if the total cost of the operation
9219 has now been calculated. */
9220 static bool
9221 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9222 {
9223 rtx inner;
9224 rtx comparator;
9225 enum rtx_code cmpcode;
9226
9227 if (COMPARISON_P (op0))
9228 {
9229 inner = XEXP (op0, 0);
9230 comparator = XEXP (op0, 1);
9231 cmpcode = GET_CODE (op0);
9232 }
9233 else
9234 {
9235 inner = op0;
9236 comparator = const0_rtx;
9237 cmpcode = NE;
9238 }
9239
9240 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9241 {
9242 /* Conditional branch. */
9243 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9244 return true;
9245 else
9246 {
9247 if (cmpcode == NE || cmpcode == EQ)
9248 {
9249 if (comparator == const0_rtx)
9250 {
9251 /* TBZ/TBNZ/CBZ/CBNZ. */
9252 if (GET_CODE (inner) == ZERO_EXTRACT)
9253 /* TBZ/TBNZ. */
9254 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9255 ZERO_EXTRACT, 0, speed);
9256 else
9257 /* CBZ/CBNZ. */
9258 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9259
9260 return true;
9261 }
9262 }
9263 else if (cmpcode == LT || cmpcode == GE)
9264 {
9265 /* TBZ/TBNZ. */
9266 if (comparator == const0_rtx)
9267 return true;
9268 }
9269 }
9270 }
9271 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9272 {
9273 /* CCMP. */
9274 if (GET_CODE (op1) == COMPARE)
9275 {
9276 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9277 if (XEXP (op1, 1) == const0_rtx)
9278 *cost += 1;
9279 if (speed)
9280 {
9281 machine_mode mode = GET_MODE (XEXP (op1, 0));
9282 const struct cpu_cost_table *extra_cost
9283 = aarch64_tune_params.insn_extra_cost;
9284
9285 if (GET_MODE_CLASS (mode) == MODE_INT)
9286 *cost += extra_cost->alu.arith;
9287 else
9288 *cost += extra_cost->fp[mode == DFmode].compare;
9289 }
9290 return true;
9291 }
9292
9293 /* It's a conditional operation based on the status flags,
9294 so it must be some flavor of CSEL. */
9295
9296 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9297 if (GET_CODE (op1) == NEG
9298 || GET_CODE (op1) == NOT
9299 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9300 op1 = XEXP (op1, 0);
9301 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9302 {
9303 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9304 op1 = XEXP (op1, 0);
9305 op2 = XEXP (op2, 0);
9306 }
9307
9308 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9309 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9310 return true;
9311 }
9312
9313 /* We don't know what this is, cost all operands. */
9314 return false;
9315 }
9316
9317 /* Check whether X is a bitfield operation of the form shift + extend that
9318 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9319 operand to which the bitfield operation is applied. Otherwise return
9320 NULL_RTX. */
9321
9322 static rtx
9323 aarch64_extend_bitfield_pattern_p (rtx x)
9324 {
9325 rtx_code outer_code = GET_CODE (x);
9326 machine_mode outer_mode = GET_MODE (x);
9327
9328 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9329 && outer_mode != SImode && outer_mode != DImode)
9330 return NULL_RTX;
9331
9332 rtx inner = XEXP (x, 0);
9333 rtx_code inner_code = GET_CODE (inner);
9334 machine_mode inner_mode = GET_MODE (inner);
9335 rtx op = NULL_RTX;
9336
9337 switch (inner_code)
9338 {
9339 case ASHIFT:
9340 if (CONST_INT_P (XEXP (inner, 1))
9341 && (inner_mode == QImode || inner_mode == HImode))
9342 op = XEXP (inner, 0);
9343 break;
9344 case LSHIFTRT:
9345 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9346 && (inner_mode == QImode || inner_mode == HImode))
9347 op = XEXP (inner, 0);
9348 break;
9349 case ASHIFTRT:
9350 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9351 && (inner_mode == QImode || inner_mode == HImode))
9352 op = XEXP (inner, 0);
9353 break;
9354 default:
9355 break;
9356 }
9357
9358 return op;
9359 }
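
/* Editor's illustrative aside -- not part of the GCC sources.  The
   shift-plus-extend shapes matched above fold into single bitfield
   instructions.  As a rough guide to what the extract forms compute, here
   is a minimal standalone sketch of reading a WIDTH-bit field at bit LSB,
   unsigned (UBFX-like) and signed (SBFX-like).  The helper names are
   hypothetical, and the signed variant assumes the usual arithmetic right
   shift of signed values (true of GCC, implementation-defined in ISO C):  */

#include <assert.h>
#include <stdint.h>

static uint64_t
ubfx_sketch (uint64_t x, unsigned lsb, unsigned width)
{
  return (x >> lsb) & (((uint64_t) 1 << width) - 1);
}

static int64_t
sbfx_sketch (uint64_t x, unsigned lsb, unsigned width)
{
  /* Move the field to the top, then arithmetic-shift it back down so the
     field's top bit becomes the sign bit.  */
  return (int64_t) (x << (64 - lsb - width)) >> (64 - width);
}

int
main (void)
{
  assert (ubfx_sketch (0xf0f0, 4, 8) == 0x0f);
  assert (sbfx_sketch (0xf0f0, 4, 8) == 0x0f);
  assert (sbfx_sketch (0x0f00, 8, 4) == -1);  /* All-ones field is negative.  */
  return 0;
}
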
9360
9361 /* Return true if the mask and a shift amount from an RTX of the form
9362 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9363 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9364
9365 bool
9366 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9367 rtx shft_amnt)
9368 {
9369 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9370 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9371 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9372 && (INTVAL (mask)
9373 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9374 }
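
/* Editor's illustrative aside -- not part of the GCC sources.  A minimal
   standalone sketch of the test above: (x << SHIFT) & MASK collapses to a
   single UBFIZ when MASK, read from bit SHIFT upwards, is one contiguous
   run of ones and has no bits set below SHIFT.  ubfiz_mask_ok is a
   hypothetical name and the sketch ignores the CONST_INT plumbing:  */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
ubfiz_mask_ok (unsigned bitsize, uint64_t mask, unsigned shift)
{
  if (shift >= bitsize)
    return false;
  uint64_t field = (mask >> shift) + 1;
  return (field & (field - 1)) == 0			/* Contiguous run.  */
	 && (mask & (((uint64_t) 1 << shift) - 1)) == 0; /* Nothing below SHIFT.  */
}

int
main (void)
{
  assert (ubfiz_mask_ok (32, 0x1f0, 4));   /* 5-bit field placed at bit 4.  */
  assert (!ubfiz_mask_ok (32, 0x1f4, 4));  /* Stray bit below the shift.  */
  assert (!ubfiz_mask_ok (32, 0x1d0, 4));  /* Field is not contiguous.  */
  return 0;
}
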
9375
9376 /* Return true if the masks and a shift amount from an RTX of the form
9377 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9378 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9379
9380 bool
9381 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9382 unsigned HOST_WIDE_INT mask1,
9383 unsigned HOST_WIDE_INT shft_amnt,
9384 unsigned HOST_WIDE_INT mask2)
9385 {
9386 unsigned HOST_WIDE_INT t;
9387
9388 /* Verify that MASK1 and MASK2 are complementary: every bit must be set in exactly one of them. */
9389 if (mask1 != ~mask2)
9390 return false;
9391
9392 /* Verify that mask2 is not all zeros or ones. */
9393 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9394 return false;
9395
9396 /* The shift amount should always be less than the mode size. */
9397 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9398
9399 /* Verify that the mask being shifted is contiguous and would be in the
9400 least significant bits after shifting by shft_amnt. */
9401 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9402 return (t == (t & -t));
9403 }
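
/* Editor's illustrative aside -- not part of the GCC sources.  The final
   test above relies on a classic bit trick: if MASK2 is one contiguous run
   of ones whose lowest set bit sits at SHFT_AMNT, then adding
   (1 << SHFT_AMNT) carries through the whole run, leaving a value that is
   zero or a single power of two, hence t == (t & -t).  A minimal
   standalone sketch (contiguous_from_bit is a hypothetical name):  */

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
contiguous_from_bit (uint64_t mask, unsigned lsb)
{
  uint64_t t = mask + ((uint64_t) 1 << lsb);
  return t == (t & -t);
}

int
main (void)
{
  assert (contiguous_from_bit (0x0ff0, 4));   /* Ones in bits 4..11.  */
  assert (!contiguous_from_bit (0x0ff0, 2));  /* Run does not start at bit 2.  */
  assert (!contiguous_from_bit (0x0f30, 4));  /* Hole inside the run.  */
  return 0;
}
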
9404
9405 /* Calculate the cost of calculating X, storing it in *COST. Result
9406 is true if the total cost of the operation has now been calculated. */
9407 static bool
9408 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9409 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9410 {
9411 rtx op0, op1, op2;
9412 const struct cpu_cost_table *extra_cost
9413 = aarch64_tune_params.insn_extra_cost;
9414 int code = GET_CODE (x);
9415 scalar_int_mode int_mode;
9416
9417 /* By default, assume that everything has equivalent cost to the
9418 cheapest instruction. Any additional costs are applied as a delta
9419 above this default. */
9420 *cost = COSTS_N_INSNS (1);
9421
9422 switch (code)
9423 {
9424 case SET:
9425 /* The cost depends entirely on the operands to SET. */
9426 *cost = 0;
9427 op0 = SET_DEST (x);
9428 op1 = SET_SRC (x);
9429
9430 switch (GET_CODE (op0))
9431 {
9432 case MEM:
9433 if (speed)
9434 {
9435 rtx address = XEXP (op0, 0);
9436 if (VECTOR_MODE_P (mode))
9437 *cost += extra_cost->ldst.storev;
9438 else if (GET_MODE_CLASS (mode) == MODE_INT)
9439 *cost += extra_cost->ldst.store;
9440 else if (mode == SFmode)
9441 *cost += extra_cost->ldst.storef;
9442 else if (mode == DFmode)
9443 *cost += extra_cost->ldst.stored;
9444
9445 *cost +=
9446 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9447 0, speed));
9448 }
9449
9450 *cost += rtx_cost (op1, mode, SET, 1, speed);
9451 return true;
9452
9453 case SUBREG:
9454 if (! REG_P (SUBREG_REG (op0)))
9455 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9456
9457 /* Fall through. */
9458 case REG:
9459 /* The cost is one per vector-register copied. */
9460 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9461 {
9462 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9463 *cost = COSTS_N_INSNS (nregs);
9464 }
9465 /* const0_rtx is in general free, but we will use an
9466 instruction to set a register to 0. */
9467 else if (REG_P (op1) || op1 == const0_rtx)
9468 {
9469 /* The cost is 1 per register copied. */
9470 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9471 *cost = COSTS_N_INSNS (nregs);
9472 }
9473 else
9474 /* Cost is just the cost of the RHS of the set. */
9475 *cost += rtx_cost (op1, mode, SET, 1, speed);
9476 return true;
9477
9478 case ZERO_EXTRACT:
9479 case SIGN_EXTRACT:
9480 /* Bit-field insertion. Strip any redundant widening of
9481 the RHS to meet the width of the target. */
9482 if (GET_CODE (op1) == SUBREG)
9483 op1 = SUBREG_REG (op1);
9484 if ((GET_CODE (op1) == ZERO_EXTEND
9485 || GET_CODE (op1) == SIGN_EXTEND)
9486 && CONST_INT_P (XEXP (op0, 1))
9487 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9488 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9489 op1 = XEXP (op1, 0);
9490
9491 if (CONST_INT_P (op1))
9492 {
9493 /* MOV immediate is assumed to always be cheap. */
9494 *cost = COSTS_N_INSNS (1);
9495 }
9496 else
9497 {
9498 /* BFM. */
9499 if (speed)
9500 *cost += extra_cost->alu.bfi;
9501 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9502 }
9503
9504 return true;
9505
9506 default:
9507 /* We can't make sense of this, assume default cost. */
9508 *cost = COSTS_N_INSNS (1);
9509 return false;
9510 }
9511 return false;
9512
9513 case CONST_INT:
9514 /* If an instruction can incorporate a constant within the
9515 instruction, the instruction's expression avoids calling
9516 rtx_cost() on the constant. If rtx_cost() is called on a
9517 constant, then it is usually because the constant must be
9518 moved into a register by one or more instructions.
9519
9520 The exception is constant 0, which can be expressed
9521 as XZR/WZR and is therefore free. The exception to this is
9522 if we have (set (reg) (const0_rtx)) in which case we must cost
9523 the move. However, we can catch that when we cost the SET, so
9524 we don't need to consider that here. */
9525 if (x == const0_rtx)
9526 *cost = 0;
9527 else
9528 {
9529 /* To an approximation, the cost of building any other constant
9530 is proportional to the number of instructions required to
9531 build that constant. This is true whether we
9532 are compiling for SPEED or otherwise. */
9533 if (!is_a <scalar_int_mode> (mode, &int_mode))
9534 int_mode = word_mode;
9535 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9536 (NULL_RTX, x, false, int_mode));
9537 }
9538 return true;
9539
9540 case CONST_DOUBLE:
9541
9542 /* First determine number of instructions to do the move
9543 as an integer constant. */
9544 if (!aarch64_float_const_representable_p (x)
9545 && !aarch64_can_const_movi_rtx_p (x, mode)
9546 && aarch64_float_const_rtx_p (x))
9547 {
9548 unsigned HOST_WIDE_INT ival;
9549 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9550 gcc_assert (succeed);
9551
9552 scalar_int_mode imode = (mode == HFmode
9553 ? SImode
9554 : int_mode_for_mode (mode).require ());
9555 int ncost = aarch64_internal_mov_immediate
9556 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9557 *cost += COSTS_N_INSNS (ncost);
9558 return true;
9559 }
9560
9561 if (speed)
9562 {
9563 /* mov[df,sf]_aarch64. */
9564 if (aarch64_float_const_representable_p (x))
9565 /* FMOV (scalar immediate). */
9566 *cost += extra_cost->fp[mode == DFmode].fpconst;
9567 else if (!aarch64_float_const_zero_rtx_p (x))
9568 {
9569 /* This will be a load from memory. */
9570 if (mode == DFmode)
9571 *cost += extra_cost->ldst.loadd;
9572 else
9573 *cost += extra_cost->ldst.loadf;
9574 }
9575 else
9576 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9577 or MOV v0.s[0], wzr - neither of which is modeled by the
9578 cost tables. Just use the default cost. */
9579 {
9580 }
9581 }
9582
9583 return true;
9584
9585 case MEM:
9586 if (speed)
9587 {
9588 /* For loads we want the base cost of a load, plus an
9589 approximation for the additional cost of the addressing
9590 mode. */
9591 rtx address = XEXP (x, 0);
9592 if (VECTOR_MODE_P (mode))
9593 *cost += extra_cost->ldst.loadv;
9594 else if (GET_MODE_CLASS (mode) == MODE_INT)
9595 *cost += extra_cost->ldst.load;
9596 else if (mode == SFmode)
9597 *cost += extra_cost->ldst.loadf;
9598 else if (mode == DFmode)
9599 *cost += extra_cost->ldst.loadd;
9600
9601 *cost +=
9602 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9603 0, speed));
9604 }
9605
9606 return true;
9607
9608 case NEG:
9609 op0 = XEXP (x, 0);
9610
9611 if (VECTOR_MODE_P (mode))
9612 {
9613 if (speed)
9614 {
9615 /* FNEG. */
9616 *cost += extra_cost->vect.alu;
9617 }
9618 return false;
9619 }
9620
9621 if (GET_MODE_CLASS (mode) == MODE_INT)
9622 {
9623 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9624 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9625 {
9626 /* CSETM. */
9627 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9628 return true;
9629 }
9630
9631 /* Cost this as SUB wzr, X. */
9632 op0 = CONST0_RTX (mode);
9633 op1 = XEXP (x, 0);
9634 goto cost_minus;
9635 }
9636
9637 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9638 {
9639 /* Support (neg(fma...)) as a single instruction only if
9640 sign of zeros is unimportant. This matches the decision
9641 making in aarch64.md. */
9642 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9643 {
9644 /* FNMADD. */
9645 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9646 return true;
9647 }
9648 if (GET_CODE (op0) == MULT)
9649 {
9650 /* FNMUL. */
9651 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9652 return true;
9653 }
9654 if (speed)
9655 /* FNEG. */
9656 *cost += extra_cost->fp[mode == DFmode].neg;
9657 return false;
9658 }
9659
9660 return false;
9661
9662 case CLRSB:
9663 case CLZ:
9664 if (speed)
9665 {
9666 if (VECTOR_MODE_P (mode))
9667 *cost += extra_cost->vect.alu;
9668 else
9669 *cost += extra_cost->alu.clz;
9670 }
9671
9672 return false;
9673
9674 case COMPARE:
9675 op0 = XEXP (x, 0);
9676 op1 = XEXP (x, 1);
9677
9678 if (op1 == const0_rtx
9679 && GET_CODE (op0) == AND)
9680 {
9681 x = op0;
9682 mode = GET_MODE (op0);
9683 goto cost_logic;
9684 }
9685
9686 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9687 {
9688 /* TODO: A write to the CC flags possibly costs extra; this
9689 needs encoding in the cost tables. */
9690
9691 mode = GET_MODE (op0);
9692 /* ANDS. */
9693 if (GET_CODE (op0) == AND)
9694 {
9695 x = op0;
9696 goto cost_logic;
9697 }
9698
9699 if (GET_CODE (op0) == PLUS)
9700 {
9701 /* ADDS (and CMN alias). */
9702 x = op0;
9703 goto cost_plus;
9704 }
9705
9706 if (GET_CODE (op0) == MINUS)
9707 {
9708 /* SUBS. */
9709 x = op0;
9710 goto cost_minus;
9711 }
9712
9713 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9714 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9715 && CONST_INT_P (XEXP (op0, 2)))
9716 {
9717 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9718 Handle it here directly rather than going to cost_logic
9719 since we know the immediate generated for the TST is valid,
9720 so we can avoid creating an intermediate rtx for it only
9721 for costing purposes. */
9722 if (speed)
9723 *cost += extra_cost->alu.logical;
9724
9725 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9726 ZERO_EXTRACT, 0, speed);
9727 return true;
9728 }
9729
9730 if (GET_CODE (op1) == NEG)
9731 {
9732 /* CMN. */
9733 if (speed)
9734 *cost += extra_cost->alu.arith;
9735
9736 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9737 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9738 return true;
9739 }
9740
9741 /* CMP.
9742
9743 Compare can freely swap the order of operands, and
9744 canonicalization puts the more complex operation first.
9745 But the integer MINUS logic expects the shift/extend
9746 operation in op1. */
9747 if (! (REG_P (op0)
9748 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9749 {
9750 op0 = XEXP (x, 1);
9751 op1 = XEXP (x, 0);
9752 }
9753 goto cost_minus;
9754 }
9755
9756 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9757 {
9758 /* FCMP. */
9759 if (speed)
9760 *cost += extra_cost->fp[mode == DFmode].compare;
9761
9762 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9763 {
9764 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9765 /* FCMP supports constant 0.0 for no extra cost. */
9766 return true;
9767 }
9768 return false;
9769 }
9770
9771 if (VECTOR_MODE_P (mode))
9772 {
9773 /* Vector compare. */
9774 if (speed)
9775 *cost += extra_cost->vect.alu;
9776
9777 if (aarch64_float_const_zero_rtx_p (op1))
9778 {
9779 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9780 cost. */
9781 return true;
9782 }
9783 return false;
9784 }
9785 return false;
9786
9787 case MINUS:
9788 {
9789 op0 = XEXP (x, 0);
9790 op1 = XEXP (x, 1);
9791
9792 cost_minus:
9793 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9794
9795 /* Detect valid immediates. */
9796 if ((GET_MODE_CLASS (mode) == MODE_INT
9797 || (GET_MODE_CLASS (mode) == MODE_CC
9798 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9799 && CONST_INT_P (op1)
9800 && aarch64_uimm12_shift (INTVAL (op1)))
9801 {
9802 if (speed)
9803 /* SUB(S) (immediate). */
9804 *cost += extra_cost->alu.arith;
9805 return true;
9806 }
9807
9808 /* Look for SUB (extended register). */
9809 if (is_a <scalar_int_mode> (mode, &int_mode)
9810 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9811 {
9812 if (speed)
9813 *cost += extra_cost->alu.extend_arith;
9814
9815 op1 = aarch64_strip_extend (op1, true);
9816 *cost += rtx_cost (op1, VOIDmode,
9817 (enum rtx_code) GET_CODE (op1), 0, speed);
9818 return true;
9819 }
9820
9821 rtx new_op1 = aarch64_strip_extend (op1, false);
9822
9823 /* Cost this as an FMA-alike operation. */
9824 if ((GET_CODE (new_op1) == MULT
9825 || aarch64_shift_p (GET_CODE (new_op1)))
9826 && code != COMPARE)
9827 {
9828 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9829 (enum rtx_code) code,
9830 speed);
9831 return true;
9832 }
9833
9834 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9835
9836 if (speed)
9837 {
9838 if (VECTOR_MODE_P (mode))
9839 {
9840 /* Vector SUB. */
9841 *cost += extra_cost->vect.alu;
9842 }
9843 else if (GET_MODE_CLASS (mode) == MODE_INT)
9844 {
9845 /* SUB(S). */
9846 *cost += extra_cost->alu.arith;
9847 }
9848 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9849 {
9850 /* FSUB. */
9851 *cost += extra_cost->fp[mode == DFmode].addsub;
9852 }
9853 }
9854 return true;
9855 }
9856
9857 case PLUS:
9858 {
9859 rtx new_op0;
9860
9861 op0 = XEXP (x, 0);
9862 op1 = XEXP (x, 1);
9863
9864 cost_plus:
9865 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9866 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9867 {
9868 /* CSINC. */
9869 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9870 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9871 return true;
9872 }
9873
9874 if (GET_MODE_CLASS (mode) == MODE_INT
9875 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9876 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9877 {
9878 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9879
9880 if (speed)
9881 /* ADD (immediate). */
9882 *cost += extra_cost->alu.arith;
9883 return true;
9884 }
9885
9886 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9887
9888 /* Look for ADD (extended register). */
9889 if (is_a <scalar_int_mode> (mode, &int_mode)
9890 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9891 {
9892 if (speed)
9893 *cost += extra_cost->alu.extend_arith;
9894
9895 op0 = aarch64_strip_extend (op0, true);
9896 *cost += rtx_cost (op0, VOIDmode,
9897 (enum rtx_code) GET_CODE (op0), 0, speed);
9898 return true;
9899 }
9900
9901 /* Strip any extend, leave shifts behind as we will
9902 cost them through mult_cost. */
9903 new_op0 = aarch64_strip_extend (op0, false);
9904
9905 if (GET_CODE (new_op0) == MULT
9906 || aarch64_shift_p (GET_CODE (new_op0)))
9907 {
9908 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9909 speed);
9910 return true;
9911 }
9912
9913 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9914
9915 if (speed)
9916 {
9917 if (VECTOR_MODE_P (mode))
9918 {
9919 /* Vector ADD. */
9920 *cost += extra_cost->vect.alu;
9921 }
9922 else if (GET_MODE_CLASS (mode) == MODE_INT)
9923 {
9924 /* ADD. */
9925 *cost += extra_cost->alu.arith;
9926 }
9927 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9928 {
9929 /* FADD. */
9930 *cost += extra_cost->fp[mode == DFmode].addsub;
9931 }
9932 }
9933 return true;
9934 }
9935
9936 case BSWAP:
9937 *cost = COSTS_N_INSNS (1);
9938
9939 if (speed)
9940 {
9941 if (VECTOR_MODE_P (mode))
9942 *cost += extra_cost->vect.alu;
9943 else
9944 *cost += extra_cost->alu.rev;
9945 }
9946 return false;
9947
9948 case IOR:
9949 if (aarch_rev16_p (x))
9950 {
9951 *cost = COSTS_N_INSNS (1);
9952
9953 if (speed)
9954 {
9955 if (VECTOR_MODE_P (mode))
9956 *cost += extra_cost->vect.alu;
9957 else
9958 *cost += extra_cost->alu.rev;
9959 }
9960 return true;
9961 }
9962
9963 if (aarch64_extr_rtx_p (x, &op0, &op1))
9964 {
9965 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9966 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9967 if (speed)
9968 *cost += extra_cost->alu.shift;
9969
9970 return true;
9971 }
9972 /* Fall through. */
9973 case XOR:
9974 case AND:
9975 cost_logic:
9976 op0 = XEXP (x, 0);
9977 op1 = XEXP (x, 1);
9978
9979 if (VECTOR_MODE_P (mode))
9980 {
9981 if (speed)
9982 *cost += extra_cost->vect.alu;
9983 return true;
9984 }
9985
9986 if (code == AND
9987 && GET_CODE (op0) == MULT
9988 && CONST_INT_P (XEXP (op0, 1))
9989 && CONST_INT_P (op1)
9990 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9991 INTVAL (op1)) != 0)
9992 {
9993 /* This is a UBFM/SBFM. */
9994 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9995 if (speed)
9996 *cost += extra_cost->alu.bfx;
9997 return true;
9998 }
9999
10000 if (is_int_mode (mode, &int_mode))
10001 {
10002 if (CONST_INT_P (op1))
10003 {
10004 /* We have a mask + shift version of a UBFIZ
10005 i.e. the *andim_ashift<mode>_bfiz pattern. */
10006 if (GET_CODE (op0) == ASHIFT
10007 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10008 XEXP (op0, 1)))
10009 {
10010 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10011 (enum rtx_code) code, 0, speed);
10012 if (speed)
10013 *cost += extra_cost->alu.bfx;
10014
10015 return true;
10016 }
10017 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10018 {
10019 /* We possibly get the immediate for free; this is not
10020 modelled. */
10021 *cost += rtx_cost (op0, int_mode,
10022 (enum rtx_code) code, 0, speed);
10023 if (speed)
10024 *cost += extra_cost->alu.logical;
10025
10026 return true;
10027 }
10028 }
10029 else
10030 {
10031 rtx new_op0 = op0;
10032
10033 /* Handle ORN, EON, or BIC. */
10034 if (GET_CODE (op0) == NOT)
10035 op0 = XEXP (op0, 0);
10036
10037 new_op0 = aarch64_strip_shift (op0);
10038
10039 /* If we had a shift on op0 then this is a logical-shift-
10040 by-register/immediate operation. Otherwise, this is just
10041 a logical operation. */
10042 if (speed)
10043 {
10044 if (new_op0 != op0)
10045 {
10046 /* Shift by immediate. */
10047 if (CONST_INT_P (XEXP (op0, 1)))
10048 *cost += extra_cost->alu.log_shift;
10049 else
10050 *cost += extra_cost->alu.log_shift_reg;
10051 }
10052 else
10053 *cost += extra_cost->alu.logical;
10054 }
10055
10056 /* In both cases we want to cost both operands. */
10057 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10058 0, speed);
10059 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10060 1, speed);
10061
10062 return true;
10063 }
10064 }
10065 return false;
10066
10067 case NOT:
10068 x = XEXP (x, 0);
10069 op0 = aarch64_strip_shift (x);
10070
10071 if (VECTOR_MODE_P (mode))
10072 {
10073 /* Vector NOT. */
10074 *cost += extra_cost->vect.alu;
10075 return false;
10076 }
10077
10078 /* MVN-shifted-reg. */
10079 if (op0 != x)
10080 {
10081 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10082
10083 if (speed)
10084 *cost += extra_cost->alu.log_shift;
10085
10086 return true;
10087 }
10088 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10089 Handle the second form here taking care that 'a' in the above can
10090 be a shift. */
10091 else if (GET_CODE (op0) == XOR)
10092 {
10093 rtx newop0 = XEXP (op0, 0);
10094 rtx newop1 = XEXP (op0, 1);
10095 rtx op0_stripped = aarch64_strip_shift (newop0);
10096
10097 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10098 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10099
10100 if (speed)
10101 {
10102 if (op0_stripped != newop0)
10103 *cost += extra_cost->alu.log_shift;
10104 else
10105 *cost += extra_cost->alu.logical;
10106 }
10107
10108 return true;
10109 }
10110 /* MVN. */
10111 if (speed)
10112 *cost += extra_cost->alu.logical;
10113
10114 return false;
10115
10116 case ZERO_EXTEND:
10117
10118 op0 = XEXP (x, 0);
10119 /* If a value is written in SI mode, then zero extended to DI
10120 mode, the operation will in general be free as a write to
10121 a 'w' register implicitly zeroes the upper bits of an 'x'
10122 register. However, if this is
10123
10124 (set (reg) (zero_extend (reg)))
10125
10126 we must cost the explicit register move. */
10127 if (mode == DImode
10128 && GET_MODE (op0) == SImode
10129 && outer == SET)
10130 {
10131 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10132
10133 /* If OP_COST is non-zero, then the cost of the zero extend
10134 is effectively the cost of the inner operation. Otherwise
10135 we have a MOV instruction and we take the cost from the MOV
10136 itself. This is true independently of whether we are
10137 optimizing for space or time. */
10138 if (op_cost)
10139 *cost = op_cost;
10140
10141 return true;
10142 }
10143 else if (MEM_P (op0))
10144 {
10145 /* All loads can zero extend to any size for free. */
10146 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10147 return true;
10148 }
10149
10150 op0 = aarch64_extend_bitfield_pattern_p (x);
10151 if (op0)
10152 {
10153 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10154 if (speed)
10155 *cost += extra_cost->alu.bfx;
10156 return true;
10157 }
10158
10159 if (speed)
10160 {
10161 if (VECTOR_MODE_P (mode))
10162 {
10163 /* UMOV. */
10164 *cost += extra_cost->vect.alu;
10165 }
10166 else
10167 {
10168 /* We generate an AND instead of UXTB/UXTH. */
10169 *cost += extra_cost->alu.logical;
10170 }
10171 }
10172 return false;
10173
10174 case SIGN_EXTEND:
10175 if (MEM_P (XEXP (x, 0)))
10176 {
10177 /* LDRSH. */
10178 if (speed)
10179 {
10180 rtx address = XEXP (XEXP (x, 0), 0);
10181 *cost += extra_cost->ldst.load_sign_extend;
10182
10183 *cost +=
10184 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10185 0, speed));
10186 }
10187 return true;
10188 }
10189
10190 op0 = aarch64_extend_bitfield_pattern_p (x);
10191 if (op0)
10192 {
10193 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10194 if (speed)
10195 *cost += extra_cost->alu.bfx;
10196 return true;
10197 }
10198
10199 if (speed)
10200 {
10201 if (VECTOR_MODE_P (mode))
10202 *cost += extra_cost->vect.alu;
10203 else
10204 *cost += extra_cost->alu.extend;
10205 }
10206 return false;
10207
10208 case ASHIFT:
10209 op0 = XEXP (x, 0);
10210 op1 = XEXP (x, 1);
10211
10212 if (CONST_INT_P (op1))
10213 {
10214 if (speed)
10215 {
10216 if (VECTOR_MODE_P (mode))
10217 {
10218 /* Vector shift (immediate). */
10219 *cost += extra_cost->vect.alu;
10220 }
10221 else
10222 {
10223 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10224 aliases. */
10225 *cost += extra_cost->alu.shift;
10226 }
10227 }
10228
10229 /* We can incorporate zero/sign extend for free. */
10230 if (GET_CODE (op0) == ZERO_EXTEND
10231 || GET_CODE (op0) == SIGN_EXTEND)
10232 op0 = XEXP (op0, 0);
10233
10234 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10235 return true;
10236 }
10237 else
10238 {
10239 if (VECTOR_MODE_P (mode))
10240 {
10241 if (speed)
10242 /* Vector shift (register). */
10243 *cost += extra_cost->vect.alu;
10244 }
10245 else
10246 {
10247 if (speed)
10248 /* LSLV. */
10249 *cost += extra_cost->alu.shift_reg;
10250
10251 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10252 && CONST_INT_P (XEXP (op1, 1))
10253 && known_eq (INTVAL (XEXP (op1, 1)),
10254 GET_MODE_BITSIZE (mode) - 1))
10255 {
10256 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10257 /* We already demanded XEXP (op1, 0) to be REG_P, so
10258 don't recurse into it. */
10259 return true;
10260 }
10261 }
10262 return false; /* All arguments need to be in registers. */
10263 }
10264
10265 case ROTATE:
10266 case ROTATERT:
10267 case LSHIFTRT:
10268 case ASHIFTRT:
10269 op0 = XEXP (x, 0);
10270 op1 = XEXP (x, 1);
10271
10272 if (CONST_INT_P (op1))
10273 {
10274 /* ASR (immediate) and friends. */
10275 if (speed)
10276 {
10277 if (VECTOR_MODE_P (mode))
10278 *cost += extra_cost->vect.alu;
10279 else
10280 *cost += extra_cost->alu.shift;
10281 }
10282
10283 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10284 return true;
10285 }
10286 else
10287 {
10288 if (VECTOR_MODE_P (mode))
10289 {
10290 if (speed)
10291 /* Vector shift (register). */
10292 *cost += extra_cost->vect.alu;
10293 }
10294 else
10295 {
10296 if (speed)
10297 /* ASR (register) and friends. */
10298 *cost += extra_cost->alu.shift_reg;
10299
10300 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10301 && CONST_INT_P (XEXP (op1, 1))
10302 && known_eq (INTVAL (XEXP (op1, 1)),
10303 GET_MODE_BITSIZE (mode) - 1))
10304 {
10305 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10306 /* We already demanded XEXP (op1, 0) to be REG_P, so
10307 don't recurse into it. */
10308 return true;
10309 }
10310 }
10311 return false; /* All arguments need to be in registers. */
10312 }
10313
10314 case SYMBOL_REF:
10315
10316 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10317 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10318 {
10319 /* LDR. */
10320 if (speed)
10321 *cost += extra_cost->ldst.load;
10322 }
10323 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10324 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10325 {
10326 /* ADRP, followed by ADD. */
10327 *cost += COSTS_N_INSNS (1);
10328 if (speed)
10329 *cost += 2 * extra_cost->alu.arith;
10330 }
10331 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10332 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10333 {
10334 /* ADR. */
10335 if (speed)
10336 *cost += extra_cost->alu.arith;
10337 }
10338
10339 if (flag_pic)
10340 {
10341 /* One extra load instruction, after accessing the GOT. */
10342 *cost += COSTS_N_INSNS (1);
10343 if (speed)
10344 *cost += extra_cost->ldst.load;
10345 }
10346 return true;
10347
10348 case HIGH:
10349 case LO_SUM:
10350 /* ADRP/ADD (immediate). */
10351 if (speed)
10352 *cost += extra_cost->alu.arith;
10353 return true;
10354
10355 case ZERO_EXTRACT:
10356 case SIGN_EXTRACT:
10357 /* UBFX/SBFX. */
10358 if (speed)
10359 {
10360 if (VECTOR_MODE_P (mode))
10361 *cost += extra_cost->vect.alu;
10362 else
10363 *cost += extra_cost->alu.bfx;
10364 }
10365
10366 /* We can trust that the immediates used will be correct (there
10367 are no by-register forms), so we need only cost op0. */
10368 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10369 return true;
10370
10371 case MULT:
10372 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10373 /* aarch64_rtx_mult_cost always handles recursion to its
10374 operands. */
10375 return true;
10376
10377 case MOD:
10378 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10379 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10380 an unconditional negate. This case should only ever be reached through
10381 the set_smod_pow2_cheap check in expmed.c. */
10382 if (CONST_INT_P (XEXP (x, 1))
10383 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10384 && (mode == SImode || mode == DImode))
10385 {
10386 /* We expand to 4 instructions. Reset the baseline. */
10387 *cost = COSTS_N_INSNS (4);
10388
10389 if (speed)
10390 *cost += 2 * extra_cost->alu.logical
10391 + 2 * extra_cost->alu.arith;
10392
10393 return true;
10394 }
10395
10396 /* Fall-through. */
10397 case UMOD:
10398 if (speed)
10399 {
10400 /* Slightly prefer UMOD over SMOD. */
10401 if (VECTOR_MODE_P (mode))
10402 *cost += extra_cost->vect.alu;
10403 else if (GET_MODE_CLASS (mode) == MODE_INT)
10404 *cost += (extra_cost->mult[mode == DImode].add
10405 + extra_cost->mult[mode == DImode].idiv
10406 + (code == MOD ? 1 : 0));
10407 }
10408 return false; /* All arguments need to be in registers. */
10409
10410 case DIV:
10411 case UDIV:
10412 case SQRT:
10413 if (speed)
10414 {
10415 if (VECTOR_MODE_P (mode))
10416 *cost += extra_cost->vect.alu;
10417 else if (GET_MODE_CLASS (mode) == MODE_INT)
10418 /* There is no integer SQRT, so only DIV and UDIV can get
10419 here. */
10420 *cost += (extra_cost->mult[mode == DImode].idiv
10421 /* Slightly prefer UDIV over SDIV. */
10422 + (code == DIV ? 1 : 0));
10423 else
10424 *cost += extra_cost->fp[mode == DFmode].div;
10425 }
10426 return false; /* All arguments need to be in registers. */
10427
10428 case IF_THEN_ELSE:
10429 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10430 XEXP (x, 2), cost, speed);
10431
10432 case EQ:
10433 case NE:
10434 case GT:
10435 case GTU:
10436 case LT:
10437 case LTU:
10438 case GE:
10439 case GEU:
10440 case LE:
10441 case LEU:
10442
10443 return false; /* All arguments must be in registers. */
10444
10445 case FMA:
10446 op0 = XEXP (x, 0);
10447 op1 = XEXP (x, 1);
10448 op2 = XEXP (x, 2);
10449
10450 if (speed)
10451 {
10452 if (VECTOR_MODE_P (mode))
10453 *cost += extra_cost->vect.alu;
10454 else
10455 *cost += extra_cost->fp[mode == DFmode].fma;
10456 }
10457
10458 /* FMSUB, FNMADD, and FNMSUB are free. */
10459 if (GET_CODE (op0) == NEG)
10460 op0 = XEXP (op0, 0);
10461
10462 if (GET_CODE (op2) == NEG)
10463 op2 = XEXP (op2, 0);
10464
10465 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10466 and the by-element operand as operand 0. */
10467 if (GET_CODE (op1) == NEG)
10468 op1 = XEXP (op1, 0);
10469
10470 /* Catch vector-by-element operations. The by-element operand can
10471 either be (vec_duplicate (vec_select (x))) or just
10472 (vec_select (x)), depending on whether we are multiplying by
10473 a vector or a scalar.
10474
10475 Canonicalization is not very good in these cases: FMA4 will put the
10476 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10477 if (GET_CODE (op0) == VEC_DUPLICATE)
10478 op0 = XEXP (op0, 0);
10479 else if (GET_CODE (op1) == VEC_DUPLICATE)
10480 op1 = XEXP (op1, 0);
10481
10482 if (GET_CODE (op0) == VEC_SELECT)
10483 op0 = XEXP (op0, 0);
10484 else if (GET_CODE (op1) == VEC_SELECT)
10485 op1 = XEXP (op1, 0);
10486
10487 /* If the remaining parameters are not registers,
10488 get the cost to put them into registers. */
10489 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10490 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10491 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10492 return true;
10493
10494 case FLOAT:
10495 case UNSIGNED_FLOAT:
10496 if (speed)
10497 *cost += extra_cost->fp[mode == DFmode].fromint;
10498 return false;
10499
10500 case FLOAT_EXTEND:
10501 if (speed)
10502 {
10503 if (VECTOR_MODE_P (mode))
10504 {
10505 /* Vector widening conversion. */
10506 *cost += extra_cost->vect.alu;
10507 }
10508 else
10509 *cost += extra_cost->fp[mode == DFmode].widen;
10510 }
10511 return false;
10512
10513 case FLOAT_TRUNCATE:
10514 if (speed)
10515 {
10516 if (VECTOR_MODE_P (mode))
10517 {
10518 /* Vector narrowing conversion. */
10519 *cost += extra_cost->vect.alu;
10520 }
10521 else
10522 *cost += extra_cost->fp[mode == DFmode].narrow;
10523 }
10524 return false;
10525
10526 case FIX:
10527 case UNSIGNED_FIX:
10528 x = XEXP (x, 0);
10529 /* Strip the rounding part. They will all be implemented
10530 by the fcvt* family of instructions anyway. */
10531 if (GET_CODE (x) == UNSPEC)
10532 {
10533 unsigned int uns_code = XINT (x, 1);
10534
10535 if (uns_code == UNSPEC_FRINTA
10536 || uns_code == UNSPEC_FRINTM
10537 || uns_code == UNSPEC_FRINTN
10538 || uns_code == UNSPEC_FRINTP
10539 || uns_code == UNSPEC_FRINTZ)
10540 x = XVECEXP (x, 0, 0);
10541 }
10542
10543 if (speed)
10544 {
10545 if (VECTOR_MODE_P (mode))
10546 *cost += extra_cost->vect.alu;
10547 else
10548 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10549 }
10550
10551 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10552 fixed-point fcvt. */
10553 if (GET_CODE (x) == MULT
10554 && ((VECTOR_MODE_P (mode)
10555 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10556 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10557 {
10558 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10559 0, speed);
10560 return true;
10561 }
10562
10563 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10564 return true;
10565
10566 case ABS:
10567 if (VECTOR_MODE_P (mode))
10568 {
10569 /* ABS (vector). */
10570 if (speed)
10571 *cost += extra_cost->vect.alu;
10572 }
10573 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10574 {
10575 op0 = XEXP (x, 0);
10576
10577 /* FABD, which is analogous to FADD. */
10578 if (GET_CODE (op0) == MINUS)
10579 {
10580 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10581 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10582 if (speed)
10583 *cost += extra_cost->fp[mode == DFmode].addsub;
10584
10585 return true;
10586 }
10587 /* Simple FABS is analogous to FNEG. */
10588 if (speed)
10589 *cost += extra_cost->fp[mode == DFmode].neg;
10590 }
10591 else
10592 {
10593 /* Integer ABS will either be split into
10594 two arithmetic instructions, or will be an ABS
10595 (scalar), which we don't model. */
10596 *cost = COSTS_N_INSNS (2);
10597 if (speed)
10598 *cost += 2 * extra_cost->alu.arith;
10599 }
10600 return false;
10601
10602 case SMAX:
10603 case SMIN:
10604 if (speed)
10605 {
10606 if (VECTOR_MODE_P (mode))
10607 *cost += extra_cost->vect.alu;
10608 else
10609 {
10610 /* FMAXNM/FMINNM/FMAX/FMIN.
10611 TODO: This may not be accurate for all implementations, but
10612 we do not model this in the cost tables. */
10613 *cost += extra_cost->fp[mode == DFmode].addsub;
10614 }
10615 }
10616 return false;
10617
10618 case UNSPEC:
10619 /* The floating point round to integer frint* instructions. */
10620 if (aarch64_frint_unspec_p (XINT (x, 1)))
10621 {
10622 if (speed)
10623 *cost += extra_cost->fp[mode == DFmode].roundint;
10624
10625 return false;
10626 }
10627
10628 if (XINT (x, 1) == UNSPEC_RBIT)
10629 {
10630 if (speed)
10631 *cost += extra_cost->alu.rev;
10632
10633 return false;
10634 }
10635 break;
10636
10637 case TRUNCATE:
10638
10639 /* Decompose <su>muldi3_highpart. */
10640 if (/* (truncate:DI */
10641 mode == DImode
10642 /* (lshiftrt:TI */
10643 && GET_MODE (XEXP (x, 0)) == TImode
10644 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10645 /* (mult:TI */
10646 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10647 /* (ANY_EXTEND:TI (reg:DI))
10648 (ANY_EXTEND:TI (reg:DI))) */
10649 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10650 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10651 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10652 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10653 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10654 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10655 /* (const_int 64) */
10656 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10657 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10658 {
10659 /* UMULH/SMULH. */
10660 if (speed)
10661 *cost += extra_cost->mult[mode == DImode].extend;
10662 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10663 mode, MULT, 0, speed);
10664 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10665 mode, MULT, 1, speed);
10666 return true;
10667 }
10668
10669 /* Fall through. */
10670 default:
10671 break;
10672 }
10673
10674 if (dump_file
10675 && flag_aarch64_verbose_cost)
10676 fprintf (dump_file,
10677 "\nFailed to cost RTX. Assuming default cost.\n");
10678
10679 return true;
10680 }
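
/* Editor's illustrative aside -- not part of the GCC sources.  The MOD
   case in the cost function above assumes that signed modulo by a power of
   two is expanded branch-free (NEGS, two ANDs and a CSNEG) instead of a
   division.  A minimal standalone sketch of the value that expansion
   computes, with C's truncate-toward-zero semantics (smod_pow2_sketch is a
   hypothetical name; N must be a power of two and the INT64_MIN corner
   case is ignored):  */

#include <assert.h>
#include <stdint.h>

static int64_t
smod_pow2_sketch (int64_t x, int64_t n)
{
  int64_t mask = n - 1;
  int64_t pos = x & mask;	  /* Remainder if X is non-negative (AND).  */
  int64_t neg = -((-x) & mask);	  /* Remainder if X is negative (NEGS/AND).  */
  return x < 0 ? neg : pos;	  /* Select, as CSNEG does.  */
}

int
main (void)
{
  assert (smod_pow2_sketch (7, 4) == 3);
  assert (smod_pow2_sketch (-7, 4) == -3);
  assert (smod_pow2_sketch (-8, 4) == 0);
  return 0;
}
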
10681
10682 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
10683 calculated for X. This cost is stored in *COST. Returns true
10684 if the total cost of X was calculated. */
10685 static bool
10686 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10687 int param, int *cost, bool speed)
10688 {
10689 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10690
10691 if (dump_file
10692 && flag_aarch64_verbose_cost)
10693 {
10694 print_rtl_single (dump_file, x);
10695 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10696 speed ? "Hot" : "Cold",
10697 *cost, result ? "final" : "partial");
10698 }
10699
10700 return result;
10701 }
10702
10703 static int
10704 aarch64_register_move_cost (machine_mode mode,
10705 reg_class_t from_i, reg_class_t to_i)
10706 {
10707 enum reg_class from = (enum reg_class) from_i;
10708 enum reg_class to = (enum reg_class) to_i;
10709 const struct cpu_regmove_cost *regmove_cost
10710 = aarch64_tune_params.regmove_cost;
10711
10712 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10713 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10714 to = GENERAL_REGS;
10715
10716 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10717 from = GENERAL_REGS;
10718
10719 /* Moving between GPR and stack cost is the same as GP2GP. */
10720 if ((from == GENERAL_REGS && to == STACK_REG)
10721 || (to == GENERAL_REGS && from == STACK_REG))
10722 return regmove_cost->GP2GP;
10723
10724 /* To/From the stack register, we move via the gprs. */
10725 if (to == STACK_REG || from == STACK_REG)
10726 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10727 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10728
10729 if (known_eq (GET_MODE_SIZE (mode), 16))
10730 {
10731 /* 128-bit operations on general registers require 2 instructions. */
10732 if (from == GENERAL_REGS && to == GENERAL_REGS)
10733 return regmove_cost->GP2GP * 2;
10734 else if (from == GENERAL_REGS)
10735 return regmove_cost->GP2FP * 2;
10736 else if (to == GENERAL_REGS)
10737 return regmove_cost->FP2GP * 2;
10738
10739 /* When AdvSIMD instructions are disabled it is not possible to move
10740 a 128-bit value directly between Q registers. This is handled in
10741 secondary reload. A general register is used as a scratch to move
10742 the upper DI value and the lower DI value is moved directly,
10743 hence the cost is the sum of three moves. */
10744 if (! TARGET_SIMD)
10745 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10746
10747 return regmove_cost->FP2FP;
10748 }
10749
10750 if (from == GENERAL_REGS && to == GENERAL_REGS)
10751 return regmove_cost->GP2GP;
10752 else if (from == GENERAL_REGS)
10753 return regmove_cost->GP2FP;
10754 else if (to == GENERAL_REGS)
10755 return regmove_cost->FP2GP;
10756
10757 return regmove_cost->FP2FP;
10758 }
10759
10760 static int
10761 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10762 reg_class_t rclass ATTRIBUTE_UNUSED,
10763 bool in ATTRIBUTE_UNUSED)
10764 {
10765 return aarch64_tune_params.memmov_cost;
10766 }
10767
10768 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10769 to optimize 1.0/sqrt. */
10770
10771 static bool
10772 use_rsqrt_p (machine_mode mode)
10773 {
10774 return (!flag_trapping_math
10775 && flag_unsafe_math_optimizations
10776 && ((aarch64_tune_params.approx_modes->recip_sqrt
10777 & AARCH64_APPROX_MODE (mode))
10778 || flag_mrecip_low_precision_sqrt));
10779 }
10780
10781 /* Function to decide when to use the approximate reciprocal square root
10782 builtin. */
10783
10784 static tree
10785 aarch64_builtin_reciprocal (tree fndecl)
10786 {
10787 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10788
10789 if (!use_rsqrt_p (mode))
10790 return NULL_TREE;
10791 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10792 }
10793
10794 /* Emit instruction sequence to compute either the approximate square root
10795 or its approximate reciprocal, depending on the flag RECP, and return
10796 whether the sequence was emitted or not. */
10797
10798 bool
10799 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10800 {
10801 machine_mode mode = GET_MODE (dst);
10802
10803 if (GET_MODE_INNER (mode) == HFmode)
10804 {
10805 gcc_assert (!recp);
10806 return false;
10807 }
10808
10809 if (!recp)
10810 {
10811 if (!(flag_mlow_precision_sqrt
10812 || (aarch64_tune_params.approx_modes->sqrt
10813 & AARCH64_APPROX_MODE (mode))))
10814 return false;
10815
10816 if (flag_finite_math_only
10817 || flag_trapping_math
10818 || !flag_unsafe_math_optimizations
10819 || optimize_function_for_size_p (cfun))
10820 return false;
10821 }
10822 else
10823 /* Caller assumes we cannot fail. */
10824 gcc_assert (use_rsqrt_p (mode));
10825
10826 machine_mode mmsk = mode_for_int_vector (mode).require ();
10827 rtx xmsk = gen_reg_rtx (mmsk);
10828 if (!recp)
10829 /* When calculating the approximate square root, compare the
10830 argument with 0.0 and create a mask. */
10831 emit_insn (gen_rtx_SET (xmsk,
10832 gen_rtx_NEG (mmsk,
10833 gen_rtx_EQ (mmsk, src,
10834 CONST0_RTX (mode)))));
10835
10836 /* Estimate the approximate reciprocal square root. */
10837 rtx xdst = gen_reg_rtx (mode);
10838 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10839
10840 /* Iterate over the series twice for SF and thrice for DF. */
10841 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10842
10843 /* Optionally iterate over the series once less for faster performance
10844 at the cost of some accuracy. */
10845 if ((recp && flag_mrecip_low_precision_sqrt)
10846 || (!recp && flag_mlow_precision_sqrt))
10847 iterations--;
10848
10849 /* Iterate over the series to calculate the approximate reciprocal square
10850 root. */
10851 rtx x1 = gen_reg_rtx (mode);
10852 while (iterations--)
10853 {
10854 rtx x2 = gen_reg_rtx (mode);
10855 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10856
10857 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10858
10859 if (iterations > 0)
10860 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10861 }
10862
10863 if (!recp)
10864 {
10865 /* Qualify the approximate reciprocal square root when the argument is
10866 0.0 by squashing the intermediary result to 0.0. */
10867 rtx xtmp = gen_reg_rtx (mmsk);
10868 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10869 gen_rtx_SUBREG (mmsk, xdst, 0)));
10870 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10871
10872 /* Calculate the approximate square root. */
10873 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10874 }
10875
10876 /* Finalize the approximation. */
10877 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10878
10879 return true;
10880 }
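/* For reference, a scalar sketch of the sequence emitted above, assuming
   the architectural semantics of FRSQRTE (initial estimate of 1/sqrt (d))
   and FRSQRTS (which computes (3 - a * b) / 2):

       x = frsqrte (d);
       repeat:  x = x * frsqrts (d, x * x);   i.e.  x *= (3 - d * x * x) / 2

   Each Newton-Raphson step roughly doubles the number of correct bits,
   hence the two iterations for SF and three for DF above.  For the
   non-reciprocal case the zero mask squashes the estimate to 0.0 when
   d == 0.0 (avoiding 0 * Inf) and the result is finally multiplied by d,
   since sqrt (d) = d / sqrt (d).  */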
10881
10882 /* Emit the instruction sequence to compute the approximation for the division
10883 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10884
10885 bool
10886 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10887 {
10888 machine_mode mode = GET_MODE (quo);
10889
10890 if (GET_MODE_INNER (mode) == HFmode)
10891 return false;
10892
10893 bool use_approx_division_p = (flag_mlow_precision_div
10894 || (aarch64_tune_params.approx_modes->division
10895 & AARCH64_APPROX_MODE (mode)));
10896
10897 if (!flag_finite_math_only
10898 || flag_trapping_math
10899 || !flag_unsafe_math_optimizations
10900 || optimize_function_for_size_p (cfun)
10901 || !use_approx_division_p)
10902 return false;
10903
10904 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10905 return false;
10906
10907 /* Estimate the approximate reciprocal. */
10908 rtx xrcp = gen_reg_rtx (mode);
10909 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10910
10911 /* Iterate over the series twice for SF and thrice for DF. */
10912 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10913
10914 /* Optionally do one fewer iteration for faster performance
10915 at the expense of accuracy. */
10916 if (flag_mlow_precision_div)
10917 iterations--;
10918
10919 /* Iterate over the series to calculate the approximate reciprocal. */
10920 rtx xtmp = gen_reg_rtx (mode);
10921 while (iterations--)
10922 {
10923 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10924
10925 if (iterations > 0)
10926 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10927 }
10928
10929 if (num != CONST1_RTX (mode))
10930 {
10931 /* As the approximate reciprocal of DEN is already calculated, only
10932 calculate the approximate division when NUM is not 1.0. */
10933 rtx xnum = force_reg (mode, num);
10934 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10935 }
10936
10937 /* Finalize the approximation. */
10938 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10939 return true;
10940 }
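/* For reference, a scalar sketch of the sequence emitted above, assuming
   the architectural semantics of FRECPE (initial estimate of 1/den) and
   FRECPS (which computes 2 - a * b):

       x = frecpe (den);
       repeat:  x = x * frecps (x, den);   i.e.  x *= (2 - den * x)
       quo = num * x;                      (multiply skipped when NUM is 1.0)

   This is the standard Newton-Raphson iteration for the reciprocal; the
   code above folds the final refinement step into the final multiply.  */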
10941
10942 /* Return the number of instructions that can be issued per cycle. */
10943 static int
10944 aarch64_sched_issue_rate (void)
10945 {
10946 return aarch64_tune_params.issue_rate;
10947 }
10948
10949 static int
10950 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10951 {
10952 int issue_rate = aarch64_sched_issue_rate ();
10953
10954 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10955 }
10956
10957
10958 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10959 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10960 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10961
10962 static int
10963 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10964 int ready_index)
10965 {
10966 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10967 }
10968
10969
10970 /* Vectorizer cost model target hooks. */
10971
10972 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10973 static int
10974 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10975 tree vectype,
10976 int misalign ATTRIBUTE_UNUSED)
10977 {
10978 unsigned elements;
10979 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10980 bool fp = false;
10981
10982 if (vectype != NULL)
10983 fp = FLOAT_TYPE_P (vectype);
10984
10985 switch (type_of_cost)
10986 {
10987 case scalar_stmt:
10988 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10989
10990 case scalar_load:
10991 return costs->scalar_load_cost;
10992
10993 case scalar_store:
10994 return costs->scalar_store_cost;
10995
10996 case vector_stmt:
10997 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10998
10999 case vector_load:
11000 return costs->vec_align_load_cost;
11001
11002 case vector_store:
11003 return costs->vec_store_cost;
11004
11005 case vec_to_scalar:
11006 return costs->vec_to_scalar_cost;
11007
11008 case scalar_to_vec:
11009 return costs->scalar_to_vec_cost;
11010
11011 case unaligned_load:
11012 case vector_gather_load:
11013 return costs->vec_unalign_load_cost;
11014
11015 case unaligned_store:
11016 case vector_scatter_store:
11017 return costs->vec_unalign_store_cost;
11018
11019 case cond_branch_taken:
11020 return costs->cond_taken_branch_cost;
11021
11022 case cond_branch_not_taken:
11023 return costs->cond_not_taken_branch_cost;
11024
11025 case vec_perm:
11026 return costs->vec_permute_cost;
11027
11028 case vec_promote_demote:
11029 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11030
11031 case vec_construct:
11032 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11033 return elements / 2 + 1;
11034
11035 default:
11036 gcc_unreachable ();
11037 }
11038 }
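/* For example, under the formula above a vec_construct of a 4-element
   vector costs 4 / 2 + 1 = 3 units; estimated_poly_value is used so that
   variable-length SVE vectors are costed from a sensible constant
   estimate of their element count.  */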
11039
11040 /* Implement targetm.vectorize.add_stmt_cost. */
11041 static unsigned
11042 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11043 struct _stmt_vec_info *stmt_info, int misalign,
11044 enum vect_cost_model_location where)
11045 {
11046 unsigned *cost = (unsigned *) data;
11047 unsigned retval = 0;
11048
11049 if (flag_vect_cost_model)
11050 {
11051 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11052 int stmt_cost =
11053 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11054
11055 /* Statements in an inner loop relative to the loop being
11056 vectorized are weighted more heavily. The value here is
11057 arbitrary and could potentially be improved with analysis. */
11058 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11059 count *= 50; /* FIXME */
11060
11061 retval = (unsigned) (count * stmt_cost);
11062 cost[where] += retval;
11063 }
11064
11065 return retval;
11066 }
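/* For example, recording COUNT = 2 vector_load statements against the loop
   body adds 2 * vec_align_load_cost to cost[vect_body]; statements in an
   inner loop relative to the loop being vectorized are first scaled by the
   factor of 50 above.  */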
11067
11068 static void initialize_aarch64_code_model (struct gcc_options *);
11069
11070 /* Parse the TO_PARSE string and put the architecture struct that it
11071 selects into RES and the architectural features into ISA_FLAGS.
11072 Return an aarch64_parse_opt_result describing the parse result.
11073 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11074 When the TO_PARSE string contains an invalid extension,
11075 a copy of the string is created and stored to INVALID_EXTENSION. */
11076
11077 static enum aarch64_parse_opt_result
11078 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11079 uint64_t *isa_flags, std::string *invalid_extension)
11080 {
11081 const char *ext;
11082 const struct processor *arch;
11083 size_t len;
11084
11085 ext = strchr (to_parse, '+');
11086
11087 if (ext != NULL)
11088 len = ext - to_parse;
11089 else
11090 len = strlen (to_parse);
11091
11092 if (len == 0)
11093 return AARCH64_PARSE_MISSING_ARG;
11094
11095
11096 /* Loop through the list of supported ARCHes to find a match. */
11097 for (arch = all_architectures; arch->name != NULL; arch++)
11098 {
11099 if (strlen (arch->name) == len
11100 && strncmp (arch->name, to_parse, len) == 0)
11101 {
11102 uint64_t isa_temp = arch->flags;
11103
11104 if (ext != NULL)
11105 {
11106 /* TO_PARSE string contains at least one extension. */
11107 enum aarch64_parse_opt_result ext_res
11108 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11109
11110 if (ext_res != AARCH64_PARSE_OK)
11111 return ext_res;
11112 }
11113 /* Extension parsing was successful. Confirm the result
11114 arch and ISA flags. */
11115 *res = arch;
11116 *isa_flags = isa_temp;
11117 return AARCH64_PARSE_OK;
11118 }
11119 }
11120
11121 /* ARCH name not found in list. */
11122 return AARCH64_PARSE_INVALID_ARG;
11123 }
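/* For example, parsing "armv8.2-a+crypto" splits the string at the first
   '+': the prefix "armv8.2-a" is looked up in all_architectures and the
   remainder "+crypto" is handed to aarch64_parse_extension, which ORs the
   corresponding feature bits into ISA_FLAGS.  (Illustrative names; the
   accepted architectures and extensions are those listed in
   aarch64-arches.def and aarch64-option-extensions.def.)  */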
11124
11125 /* Parse the TO_PARSE string and put the result tuning in RES and the
11126 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11127 describing the parse result. If there is an error parsing, RES and
11128 ISA_FLAGS are left unchanged.
11129 When the TO_PARSE string contains an invalid extension,
11130 a copy of the string is created and stored to INVALID_EXTENSION. */
11131
11132 static enum aarch64_parse_opt_result
11133 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11134 uint64_t *isa_flags, std::string *invalid_extension)
11135 {
11136 const char *ext;
11137 const struct processor *cpu;
11138 size_t len;
11139
11140 ext = strchr (to_parse, '+');
11141
11142 if (ext != NULL)
11143 len = ext - to_parse;
11144 else
11145 len = strlen (to_parse);
11146
11147 if (len == 0)
11148 return AARCH64_PARSE_MISSING_ARG;
11149
11150
11151 /* Loop through the list of supported CPUs to find a match. */
11152 for (cpu = all_cores; cpu->name != NULL; cpu++)
11153 {
11154 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11155 {
11156 uint64_t isa_temp = cpu->flags;
11157
11158
11159 if (ext != NULL)
11160 {
11161 /* TO_PARSE string contains at least one extension. */
11162 enum aarch64_parse_opt_result ext_res
11163 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11164
11165 if (ext_res != AARCH64_PARSE_OK)
11166 return ext_res;
11167 }
11168 /* Extension parsing was successful. Confirm the result
11169 cpu and ISA flags. */
11170 *res = cpu;
11171 *isa_flags = isa_temp;
11172 return AARCH64_PARSE_OK;
11173 }
11174 }
11175
11176 /* CPU name not found in list. */
11177 return AARCH64_PARSE_INVALID_ARG;
11178 }
11179
11180 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11181 Return an aarch64_parse_opt_result describing the parse result.
11182 If the parsing fails, RES does not change. */
11183
11184 static enum aarch64_parse_opt_result
11185 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11186 {
11187 const struct processor *cpu;
11188
11189 /* Loop through the list of supported CPUs to find a match. */
11190 for (cpu = all_cores; cpu->name != NULL; cpu++)
11191 {
11192 if (strcmp (cpu->name, to_parse) == 0)
11193 {
11194 *res = cpu;
11195 return AARCH64_PARSE_OK;
11196 }
11197 }
11198
11199 /* CPU name not found in list. */
11200 return AARCH64_PARSE_INVALID_ARG;
11201 }
11202
11203 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11204 described in FLAG. If it is, return the index bit for that fusion type.
11205 If not, error (printing OPTION_NAME) and return zero. */
11206
11207 static unsigned int
11208 aarch64_parse_one_option_token (const char *token,
11209 size_t length,
11210 const struct aarch64_flag_desc *flag,
11211 const char *option_name)
11212 {
11213 for (; flag->name != NULL; flag++)
11214 {
11215 if (length == strlen (flag->name)
11216 && !strncmp (flag->name, token, length))
11217 return flag->flag;
11218 }
11219
11220 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11221 return 0;
11222 }
11223
11224 /* Parse OPTION which is a comma-separated list of flags to enable.
11225 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11226 default state we inherit from the CPU tuning structures. OPTION_NAME
11227 gives the top-level option we are parsing in the -moverride string,
11228 for use in error messages. */
11229
11230 static unsigned int
11231 aarch64_parse_boolean_options (const char *option,
11232 const struct aarch64_flag_desc *flags,
11233 unsigned int initial_state,
11234 const char *option_name)
11235 {
11236 const char separator = '.';
11237 const char* specs = option;
11238 const char* ntoken = option;
11239 unsigned int found_flags = initial_state;
11240
11241 while ((ntoken = strchr (specs, separator)))
11242 {
11243 size_t token_length = ntoken - specs;
11244 unsigned token_ops = aarch64_parse_one_option_token (specs,
11245 token_length,
11246 flags,
11247 option_name);
11248 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11249 in the token stream, reset the supported operations. So:
11250
11251 adrp+add.cmp+branch.none.adrp+add
11252
11253 would have the result of turning on only adrp+add fusion. */
11254 if (!token_ops)
11255 found_flags = 0;
11256
11257 found_flags |= token_ops;
11258 specs = ++ntoken;
11259 }
11260
11261 /* The string ended with a trailing separator; report an error. */
11262 if (!(*specs))
11263 {
11264 error ("%s string ill-formed\n", option_name);
11265 return 0;
11266 }
11267
11268 /* We still have one more token to parse. */
11269 size_t token_length = strlen (specs);
11270 unsigned token_ops = aarch64_parse_one_option_token (specs,
11271 token_length,
11272 flags,
11273 option_name);
11274 if (!token_ops)
11275 found_flags = 0;
11276
11277 found_flags |= token_ops;
11278 return found_flags;
11279 }
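/* For example, parsing "adrp+add.cmp+branch" for -moverride=fuse= ORs the
   flags for those two fusion pairs (nominally AARCH64_FUSE_ADRP_ADD and
   AARCH64_FUSE_CMP_BRANCH from aarch64-fusion-pairs.def) into the initial
   state inherited from the CPU tuning struct, while a "none" token anywhere
   in the list first clears the accumulated set, as in the
   adrp+add.cmp+branch.none.adrp+add example above.  */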
11280
11281 /* Support for overriding instruction fusion. */
11282
11283 static void
11284 aarch64_parse_fuse_string (const char *fuse_string,
11285 struct tune_params *tune)
11286 {
11287 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11288 aarch64_fusible_pairs,
11289 tune->fusible_ops,
11290 "fuse=");
11291 }
11292
11293 /* Support for overriding other tuning flags. */
11294
11295 static void
11296 aarch64_parse_tune_string (const char *tune_string,
11297 struct tune_params *tune)
11298 {
11299 tune->extra_tuning_flags
11300 = aarch64_parse_boolean_options (tune_string,
11301 aarch64_tuning_flags,
11302 tune->extra_tuning_flags,
11303 "tune=");
11304 }
11305
11306 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11307 Accept the valid SVE vector widths allowed by
11308 aarch64_sve_vector_bits_enum and use it to override sve_width
11309 in TUNE. */
11310
11311 static void
11312 aarch64_parse_sve_width_string (const char *tune_string,
11313 struct tune_params *tune)
11314 {
11315 int width = -1;
11316
11317 int n = sscanf (tune_string, "%d", &width);
11318 if (n == EOF)
11319 {
11320 error ("invalid format for sve_width");
11321 return;
11322 }
11323 switch (width)
11324 {
11325 case SVE_128:
11326 case SVE_256:
11327 case SVE_512:
11328 case SVE_1024:
11329 case SVE_2048:
11330 break;
11331 default:
11332 error ("invalid sve_width value: %d", width);
11333 }
11334 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11335 }
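/* For example, -moverride=sve_width=256 sets tune->sve_width to SVE_256;
   widths other than 128, 256, 512, 1024 or 2048 are diagnosed as invalid
   above.  */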
11336
11337 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11338 we understand. If it is, extract the option string and hand it off to
11339 the appropriate function. */
11340
11341 void
11342 aarch64_parse_one_override_token (const char* token,
11343 size_t length,
11344 struct tune_params *tune)
11345 {
11346 const struct aarch64_tuning_override_function *fn
11347 = aarch64_tuning_override_functions;
11348
11349 const char *option_part = strchr (token, '=');
11350 if (!option_part)
11351 {
11352 error ("tuning string missing in option (%s)", token);
11353 return;
11354 }
11355
11356 /* Get the length of the option name. */
11357 length = option_part - token;
11358 /* Skip the '=' to get to the option string. */
11359 option_part++;
11360
11361 for (; fn->name != NULL; fn++)
11362 {
11363 if (!strncmp (fn->name, token, length))
11364 {
11365 fn->parse_override (option_part, tune);
11366 return;
11367 }
11368 }
11369
11370 error ("unknown tuning option (%s)",token);
11371 return;
11372 }
11373
11374 /* Validate and clamp the TLS size according to the code model in OPTS. */
11375
11376 static void
11377 initialize_aarch64_tls_size (struct gcc_options *opts)
11378 {
11379 if (aarch64_tls_size == 0)
11380 aarch64_tls_size = 24;
11381
11382 switch (opts->x_aarch64_cmodel_var)
11383 {
11384 case AARCH64_CMODEL_TINY:
11385 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11386 needs two instructions to address, so we clamp the size to 24. */
11387 if (aarch64_tls_size > 24)
11388 aarch64_tls_size = 24;
11389 break;
11390 case AARCH64_CMODEL_SMALL:
11391 /* The maximum TLS size allowed under small is 4G. */
11392 if (aarch64_tls_size > 32)
11393 aarch64_tls_size = 32;
11394 break;
11395 case AARCH64_CMODEL_LARGE:
11396 /* The maximum TLS size allowed under large is 16E.
11397 FIXME: 16E requires a 64-bit offset, but we only support a 48-bit offset for now. */
11398 if (aarch64_tls_size > 48)
11399 aarch64_tls_size = 48;
11400 break;
11401 default:
11402 gcc_unreachable ();
11403 }
11404
11405 return;
11406 }
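/* For example, with -mcmodel=tiny an explicit -mtls-size=32 is clamped to
   24 bits (1M of TLS addressable with two instructions), while the small
   model allows up to 32 bits (4G) and the large model up to 48 bits.  */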
11407
11408 /* Parse STRING looking for options in the format:
11409 string :: option:string
11410 option :: name=substring
11411 name :: {a-z}
11412 substring :: defined by option. */
11413
11414 static void
11415 aarch64_parse_override_string (const char* input_string,
11416 struct tune_params* tune)
11417 {
11418 const char separator = ':';
11419 size_t string_length = strlen (input_string) + 1;
11420 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11421 char *string = string_root;
11422 strncpy (string, input_string, string_length);
11423 string[string_length - 1] = '\0';
11424
11425 char* ntoken = string;
11426
11427 while ((ntoken = strchr (string, separator)))
11428 {
11429 size_t token_length = ntoken - string;
11430 /* Make this substring look like a string. */
11431 *ntoken = '\0';
11432 aarch64_parse_one_override_token (string, token_length, tune);
11433 string = ++ntoken;
11434 }
11435
11436 /* One last option to parse. */
11437 aarch64_parse_one_override_token (string, strlen (string), tune);
11438 free (string_root);
11439 }
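/* For example, -moverride=fuse=adrp+add:sve_width=256 is split at each ':'
   into the tokens "fuse=adrp+add" and "sve_width=256", each of which is
   dispatched through aarch64_tuning_override_functions to the matching
   parser above.  */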
11440
11441
11442 static void
11443 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11444 {
11445 if (accepted_branch_protection_string)
11446 {
11447 opts->x_aarch64_branch_protection_string
11448 = xstrdup (accepted_branch_protection_string);
11449 }
11450
11451 /* PR 70044: We have to be careful about being called multiple times for the
11452 same function. This means all changes should be repeatable. */
11453
11454 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11455 Disable the frame pointer flag so the mid-end will not use a frame
11456 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11457 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11458 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11459 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11460 if (opts->x_flag_omit_frame_pointer == 0)
11461 opts->x_flag_omit_frame_pointer = 2;
11462
11463 /* If not optimizing for size, set the default
11464 alignment to what the target wants. */
11465 if (!opts->x_optimize_size)
11466 {
11467 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11468 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11469 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11470 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11471 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11472 opts->x_str_align_functions = aarch64_tune_params.function_align;
11473 }
11474
11475 /* We default to no pc-relative literal loads. */
11476
11477 aarch64_pcrelative_literal_loads = false;
11478
11479 /* If -mpc-relative-literal-loads is set on the command line, this
11480 implies that the user asked for PC relative literal loads. */
11481 if (opts->x_pcrelative_literal_loads == 1)
11482 aarch64_pcrelative_literal_loads = true;
11483
11484 /* In the tiny memory model it makes no sense to disallow PC relative
11485 literal pool loads. */
11486 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11487 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11488 aarch64_pcrelative_literal_loads = true;
11489
11490 /* When enabling the lower precision Newton series for the square root, also
11491 enable it for the reciprocal square root, since the latter is an
11492 intermediary step for the former. */
11493 if (flag_mlow_precision_sqrt)
11494 flag_mrecip_low_precision_sqrt = true;
11495 }
11496
11497 /* 'Unpack' the internal tuning structs and update the options
11498 in OPTS. The caller must have set up selected_tune and selected_arch
11499 as all the other target-specific codegen decisions are
11500 derived from them. */
11501
11502 void
11503 aarch64_override_options_internal (struct gcc_options *opts)
11504 {
11505 aarch64_tune_flags = selected_tune->flags;
11506 aarch64_tune = selected_tune->sched_core;
11507 /* Make a copy of the tuning parameters attached to the core, which
11508 we may later overwrite. */
11509 aarch64_tune_params = *(selected_tune->tune);
11510 aarch64_architecture_version = selected_arch->architecture_version;
11511
11512 if (opts->x_aarch64_override_tune_string)
11513 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11514 &aarch64_tune_params);
11515
11516 /* This target defaults to strict volatile bitfields. */
11517 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11518 opts->x_flag_strict_volatile_bitfields = 1;
11519
11520 if (aarch64_stack_protector_guard == SSP_GLOBAL
11521 && opts->x_aarch64_stack_protector_guard_offset_str)
11522 {
11523 error ("incompatible options %<-mstack-protector-guard=global%> and "
11524 "%<-mstack-protector-guard-offset=%s%>",
11525 aarch64_stack_protector_guard_offset_str);
11526 }
11527
11528 if (aarch64_stack_protector_guard == SSP_SYSREG
11529 && !(opts->x_aarch64_stack_protector_guard_offset_str
11530 && opts->x_aarch64_stack_protector_guard_reg_str))
11531 {
11532 error ("both %<-mstack-protector-guard-offset%> and "
11533 "%<-mstack-protector-guard-reg%> must be used "
11534 "with %<-mstack-protector-guard=sysreg%>");
11535 }
11536
11537 if (opts->x_aarch64_stack_protector_guard_reg_str)
11538 {
11539 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11540 error ("specify a system register with a small string length.");
11541 }
11542
11543 if (opts->x_aarch64_stack_protector_guard_offset_str)
11544 {
11545 char *end;
11546 const char *str = aarch64_stack_protector_guard_offset_str;
11547 errno = 0;
11548 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11549 if (!*str || *end || errno)
11550 error ("%qs is not a valid offset in %qs", str,
11551 "-mstack-protector-guard-offset=");
11552 aarch64_stack_protector_guard_offset = offs;
11553 }
11554
11555 initialize_aarch64_code_model (opts);
11556 initialize_aarch64_tls_size (opts);
11557
11558 int queue_depth = 0;
11559 switch (aarch64_tune_params.autoprefetcher_model)
11560 {
11561 case tune_params::AUTOPREFETCHER_OFF:
11562 queue_depth = -1;
11563 break;
11564 case tune_params::AUTOPREFETCHER_WEAK:
11565 queue_depth = 0;
11566 break;
11567 case tune_params::AUTOPREFETCHER_STRONG:
11568 queue_depth = max_insn_queue_index + 1;
11569 break;
11570 default:
11571 gcc_unreachable ();
11572 }
11573
11574 /* We don't mind passing in global_options_set here as we don't use
11575 the *options_set structs anyway. */
11576 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11577 queue_depth,
11578 opts->x_param_values,
11579 global_options_set.x_param_values);
11580
11581 /* Set up parameters to be used in prefetching algorithm. Do not
11582 override the defaults unless we are tuning for a core we have
11583 researched values for. */
11584 if (aarch64_tune_params.prefetch->num_slots > 0)
11585 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11586 aarch64_tune_params.prefetch->num_slots,
11587 opts->x_param_values,
11588 global_options_set.x_param_values);
11589 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11590 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11591 aarch64_tune_params.prefetch->l1_cache_size,
11592 opts->x_param_values,
11593 global_options_set.x_param_values);
11594 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11595 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11596 aarch64_tune_params.prefetch->l1_cache_line_size,
11597 opts->x_param_values,
11598 global_options_set.x_param_values);
11599 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11600 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11601 aarch64_tune_params.prefetch->l2_cache_size,
11602 opts->x_param_values,
11603 global_options_set.x_param_values);
11604 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11605 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11606 0,
11607 opts->x_param_values,
11608 global_options_set.x_param_values);
11609 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11610 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11611 aarch64_tune_params.prefetch->minimum_stride,
11612 opts->x_param_values,
11613 global_options_set.x_param_values);
11614
11615 /* Use the alternative scheduling-pressure algorithm by default. */
11616 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11617 opts->x_param_values,
11618 global_options_set.x_param_values);
11619
11620 /* If the user hasn't changed it via configure then set the default to 64 KB
11621 for the backend. */
11622 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11623 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11624 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11625 opts->x_param_values,
11626 global_options_set.x_param_values);
11627
11628 /* Validate the guard size. */
11629 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11630
11631 /* Enforce that interval is the same size as size so the mid-end does the
11632 right thing. */
11633 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11634 guard_size,
11635 opts->x_param_values,
11636 global_options_set.x_param_values);
11637
11638 /* The maybe_set calls won't update the value if the user has explicitly set
11639 one. Which means we need to validate that probing interval and guard size
11640 are equal. */
11641 int probe_interval
11642 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11643 if (guard_size != probe_interval)
11644 error ("stack clash guard size %<%d%> must be equal to probing interval "
11645 "%<%d%>", guard_size, probe_interval);
11646
11647 /* Enable sw prefetching at specified optimization level for
11648 CPUS that have prefetch. Lower optimization level threshold by 1
11649 when profiling is enabled. */
11650 if (opts->x_flag_prefetch_loop_arrays < 0
11651 && !opts->x_optimize_size
11652 && aarch64_tune_params.prefetch->default_opt_level >= 0
11653 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11654 opts->x_flag_prefetch_loop_arrays = 1;
11655
11656 if (opts->x_aarch64_arch_string == NULL)
11657 opts->x_aarch64_arch_string = selected_arch->name;
11658 if (opts->x_aarch64_cpu_string == NULL)
11659 opts->x_aarch64_cpu_string = selected_cpu->name;
11660 if (opts->x_aarch64_tune_string == NULL)
11661 opts->x_aarch64_tune_string = selected_tune->name;
11662
11663 aarch64_override_options_after_change_1 (opts);
11664 }
11665
11666 /* Print a hint with a suggestion for a core or architecture name that
11667 most closely resembles what the user passed in STR. ARCH is true if
11668 the user is asking for an architecture name. ARCH is false if the user
11669 is asking for a core name. */
11670
11671 static void
11672 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11673 {
11674 auto_vec<const char *> candidates;
11675 const struct processor *entry = arch ? all_architectures : all_cores;
11676 for (; entry->name != NULL; entry++)
11677 candidates.safe_push (entry->name);
11678
11679 #ifdef HAVE_LOCAL_CPU_DETECT
11680 /* Add also "native" as possible value. */
11681 if (arch)
11682 candidates.safe_push ("native");
11683 #endif
11684
11685 char *s;
11686 const char *hint = candidates_list_and_hint (str, s, candidates);
11687 if (hint)
11688 inform (input_location, "valid arguments are: %s;"
11689 " did you mean %qs?", s, hint);
11690 else
11691 inform (input_location, "valid arguments are: %s", s);
11692
11693 XDELETEVEC (s);
11694 }
11695
11696 /* Print a hint with a suggestion for a core name that most closely resembles
11697 what the user passed in STR. */
11698
11699 inline static void
11700 aarch64_print_hint_for_core (const char *str)
11701 {
11702 aarch64_print_hint_for_core_or_arch (str, false);
11703 }
11704
11705 /* Print a hint with a suggestion for an architecture name that most closely
11706 resembles what the user passed in STR. */
11707
11708 inline static void
11709 aarch64_print_hint_for_arch (const char *str)
11710 {
11711 aarch64_print_hint_for_core_or_arch (str, true);
11712 }
11713
11714
11715 /* Print a hint with a suggestion for an extension name
11716 that most closely resembles what the user passed in STR. */
11717
11718 void
11719 aarch64_print_hint_for_extensions (const std::string &str)
11720 {
11721 auto_vec<const char *> candidates;
11722 aarch64_get_all_extension_candidates (&candidates);
11723 char *s;
11724 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11725 if (hint)
11726 inform (input_location, "valid arguments are: %s;"
11727 " did you mean %qs?", s, hint);
11728 else
11729 inform (input_location, "valid arguments are: %s", s);
11730
11731 XDELETEVEC (s);
11732 }
11733
11734 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11735 specified in STR and throw errors if appropriate. Put the results if
11736 they are valid in RES and ISA_FLAGS. Return whether the option is
11737 valid. */
11738
11739 static bool
11740 aarch64_validate_mcpu (const char *str, const struct processor **res,
11741 uint64_t *isa_flags)
11742 {
11743 std::string invalid_extension;
11744 enum aarch64_parse_opt_result parse_res
11745 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11746
11747 if (parse_res == AARCH64_PARSE_OK)
11748 return true;
11749
11750 switch (parse_res)
11751 {
11752 case AARCH64_PARSE_MISSING_ARG:
11753 error ("missing cpu name in %<-mcpu=%s%>", str);
11754 break;
11755 case AARCH64_PARSE_INVALID_ARG:
11756 error ("unknown value %qs for %<-mcpu%>", str);
11757 aarch64_print_hint_for_core (str);
11758 break;
11759 case AARCH64_PARSE_INVALID_FEATURE:
11760 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11761 invalid_extension.c_str (), str);
11762 aarch64_print_hint_for_extensions (invalid_extension);
11763 break;
11764 default:
11765 gcc_unreachable ();
11766 }
11767
11768 return false;
11769 }
11770
11771 /* Parses CONST_STR for branch protection features specified in
11772 aarch64_branch_protect_types, and sets any global variables required. Returns
11773 the parsing result and assigns LAST_STR to the last processed token from
11774 CONST_STR so that it can be used for error reporting. */
11775
11776 static enum
11777 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11778 char** last_str)
11779 {
11780 char *str_root = xstrdup (const_str);
11781 char* token_save = NULL;
11782 char *str = strtok_r (str_root, "+", &token_save);
11783 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11784 if (!str)
11785 res = AARCH64_PARSE_MISSING_ARG;
11786 else
11787 {
11788 char *next_str = strtok_r (NULL, "+", &token_save);
11789 /* Reset the branch protection features to their defaults. */
11790 aarch64_handle_no_branch_protection (NULL, NULL);
11791
11792 while (str && res == AARCH64_PARSE_OK)
11793 {
11794 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11795 bool found = false;
11796 /* Search for this type. */
11797 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11798 {
11799 if (strcmp (str, type->name) == 0)
11800 {
11801 found = true;
11802 res = type->handler (str, next_str);
11803 str = next_str;
11804 next_str = strtok_r (NULL, "+", &token_save);
11805 }
11806 else
11807 type++;
11808 }
11809 if (found && res == AARCH64_PARSE_OK)
11810 {
11811 bool found_subtype = true;
11812 /* Loop through each token until we find one that isn't a
11813 subtype. */
11814 while (found_subtype)
11815 {
11816 found_subtype = false;
11817 const aarch64_branch_protect_type *subtype = type->subtypes;
11818 /* Search for the subtype. */
11819 while (str && subtype && subtype->name && !found_subtype
11820 && res == AARCH64_PARSE_OK)
11821 {
11822 if (strcmp (str, subtype->name) == 0)
11823 {
11824 found_subtype = true;
11825 res = subtype->handler (str, next_str);
11826 str = next_str;
11827 next_str = strtok_r (NULL, "+", &token_save);
11828 }
11829 else
11830 subtype++;
11831 }
11832 }
11833 }
11834 else if (!found)
11835 res = AARCH64_PARSE_INVALID_ARG;
11836 }
11837 }
11838 /* Copy the last processed token into the argument to pass it back.
11839 Used by option and attribute validation to print the offending token. */
11840 if (last_str)
11841 {
11842 if (str) strcpy (*last_str, str);
11843 else *last_str = NULL;
11844 }
11845 if (res == AARCH64_PARSE_OK)
11846 {
11847 /* If needed, alloc the accepted string then copy in const_str.
11848 Used by override_option_after_change_1. */
11849 if (!accepted_branch_protection_string)
11850 accepted_branch_protection_string = (char *) xmalloc (
11851 BRANCH_PROTECT_STR_MAX
11852 + 1);
11853 strncpy (accepted_branch_protection_string, const_str,
11854 BRANCH_PROTECT_STR_MAX + 1);
11855 /* Forcibly null-terminate. */
11856 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11857 }
11858 return res;
11859 }
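/* For example, "pac-ret+leaf+b-key" is tokenized at each '+': "pac-ret"
   matches a top-level entry in aarch64_branch_protect_types, after which
   "leaf" and "b-key" are consumed as its subtypes; a subsequent "+bti"
   token would then match the next top-level entry.  (Illustrative token
   names; the accepted set is whatever aarch64_branch_protect_types
   defines.)  */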
11860
11861 static bool
11862 aarch64_validate_mbranch_protection (const char *const_str)
11863 {
11864 char *str = (char *) xmalloc (strlen (const_str) + 1);
11865 enum aarch64_parse_opt_result res =
11866 aarch64_parse_branch_protection (const_str, &str);
11867 if (res == AARCH64_PARSE_INVALID_ARG)
11868 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11869 else if (res == AARCH64_PARSE_MISSING_ARG)
11870 error ("missing argument for %<-mbranch-protection=%>");
11871 free (str);
11872 return res == AARCH64_PARSE_OK;
11873 }
11874
11875 /* Validate a command-line -march option. Parse the arch and extensions
11876 (if any) specified in STR and throw errors if appropriate. Put the
11877 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11878 option is valid. */
11879
11880 static bool
11881 aarch64_validate_march (const char *str, const struct processor **res,
11882 uint64_t *isa_flags)
11883 {
11884 std::string invalid_extension;
11885 enum aarch64_parse_opt_result parse_res
11886 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11887
11888 if (parse_res == AARCH64_PARSE_OK)
11889 return true;
11890
11891 switch (parse_res)
11892 {
11893 case AARCH64_PARSE_MISSING_ARG:
11894 error ("missing arch name in %<-march=%s%>", str);
11895 break;
11896 case AARCH64_PARSE_INVALID_ARG:
11897 error ("unknown value %qs for %<-march%>", str);
11898 aarch64_print_hint_for_arch (str);
11899 break;
11900 case AARCH64_PARSE_INVALID_FEATURE:
11901 error ("invalid feature modifier %qs in %<-march=%s%>",
11902 invalid_extension.c_str (), str);
11903 aarch64_print_hint_for_extensions (invalid_extension);
11904 break;
11905 default:
11906 gcc_unreachable ();
11907 }
11908
11909 return false;
11910 }
11911
11912 /* Validate a command-line -mtune option. Parse the cpu
11913 specified in STR and throw errors if appropriate. Put the
11914 result, if it is valid, in RES. Return whether the option is
11915 valid. */
11916
11917 static bool
11918 aarch64_validate_mtune (const char *str, const struct processor **res)
11919 {
11920 enum aarch64_parse_opt_result parse_res
11921 = aarch64_parse_tune (str, res);
11922
11923 if (parse_res == AARCH64_PARSE_OK)
11924 return true;
11925
11926 switch (parse_res)
11927 {
11928 case AARCH64_PARSE_MISSING_ARG:
11929 error ("missing cpu name in %<-mtune=%s%>", str);
11930 break;
11931 case AARCH64_PARSE_INVALID_ARG:
11932 error ("unknown value %qs for %<-mtune%>", str);
11933 aarch64_print_hint_for_core (str);
11934 break;
11935 default:
11936 gcc_unreachable ();
11937 }
11938 return false;
11939 }
11940
11941 /* Return the CPU corresponding to the enum CPU.
11942 If it doesn't specify a cpu, return the default. */
11943
11944 static const struct processor *
11945 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11946 {
11947 if (cpu != aarch64_none)
11948 return &all_cores[cpu];
11949
11950 /* The & 0x3f is to extract the bottom 6 bits that encode the
11951 default cpu as selected by the --with-cpu GCC configure option
11952 in config.gcc.
11953 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11954 flags mechanism should be reworked to make it more sane. */
11955 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11956 }
11957
11958 /* Return the architecture corresponding to the enum ARCH.
11959 If it doesn't specify a valid architecture, return the default. */
11960
11961 static const struct processor *
11962 aarch64_get_arch (enum aarch64_arch arch)
11963 {
11964 if (arch != aarch64_no_arch)
11965 return &all_architectures[arch];
11966
11967 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11968
11969 return &all_architectures[cpu->arch];
11970 }
11971
11972 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11973
11974 static poly_uint16
11975 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11976 {
11977 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11978 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11979 deciding which .md file patterns to use and when deciding whether
11980 something is a legitimate address or constant. */
11981 if (value == SVE_SCALABLE || value == SVE_128)
11982 return poly_uint16 (2, 2);
11983 else
11984 return (int) value / 64;
11985 }
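/* For example, -msve-vector-bits=512 yields a constant VG of 512 / 64 = 8,
   while both SVE_SCALABLE and SVE_128 produce the poly_uint16 (2, 2),
   i.e. VG = 2 + 2 * x where the indeterminate x counts the 128-bit blocks
   the runtime vector length has beyond the 128-bit minimum.  */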
11986
11987 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11988 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11989 tuning structs. In particular it must set selected_tune and
11990 aarch64_isa_flags that define the available ISA features and tuning
11991 decisions. It must also set selected_arch as this will be used to
11992 output the .arch asm tags for each function. */
11993
11994 static void
11995 aarch64_override_options (void)
11996 {
11997 uint64_t cpu_isa = 0;
11998 uint64_t arch_isa = 0;
11999 aarch64_isa_flags = 0;
12000
12001 bool valid_cpu = true;
12002 bool valid_tune = true;
12003 bool valid_arch = true;
12004
12005 selected_cpu = NULL;
12006 selected_arch = NULL;
12007 selected_tune = NULL;
12008
12009 if (aarch64_branch_protection_string)
12010 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12011
12012 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12013 If either of -march or -mtune is given, they override their
12014 respective component of -mcpu. */
12015 if (aarch64_cpu_string)
12016 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12017 &cpu_isa);
12018
12019 if (aarch64_arch_string)
12020 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12021 &arch_isa);
12022
12023 if (aarch64_tune_string)
12024 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12025
12026 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12027 SUBTARGET_OVERRIDE_OPTIONS;
12028 #endif
12029
12030 /* If the user did not specify a processor, choose the default
12031 one for them. This will be the CPU set during configuration using
12032 --with-cpu, otherwise it is "generic". */
12033 if (!selected_cpu)
12034 {
12035 if (selected_arch)
12036 {
12037 selected_cpu = &all_cores[selected_arch->ident];
12038 aarch64_isa_flags = arch_isa;
12039 explicit_arch = selected_arch->arch;
12040 }
12041 else
12042 {
12043 /* Get default configure-time CPU. */
12044 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12045 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12046 }
12047
12048 if (selected_tune)
12049 explicit_tune_core = selected_tune->ident;
12050 }
12051 /* If both -mcpu and -march are specified check that they are architecturally
12052 compatible, warn if they're not and prefer the -march ISA flags. */
12053 else if (selected_arch)
12054 {
12055 if (selected_arch->arch != selected_cpu->arch)
12056 {
12057 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12058 all_architectures[selected_cpu->arch].name,
12059 selected_arch->name);
12060 }
12061 aarch64_isa_flags = arch_isa;
12062 explicit_arch = selected_arch->arch;
12063 explicit_tune_core = selected_tune ? selected_tune->ident
12064 : selected_cpu->ident;
12065 }
12066 else
12067 {
12068 /* -mcpu but no -march. */
12069 aarch64_isa_flags = cpu_isa;
12070 explicit_tune_core = selected_tune ? selected_tune->ident
12071 : selected_cpu->ident;
12072 gcc_assert (selected_cpu);
12073 selected_arch = &all_architectures[selected_cpu->arch];
12074 explicit_arch = selected_arch->arch;
12075 }
12076
12077 /* Set the arch as well, as we will need it when outputting
12078 the .arch directive in assembly. */
12079 if (!selected_arch)
12080 {
12081 gcc_assert (selected_cpu);
12082 selected_arch = &all_architectures[selected_cpu->arch];
12083 }
12084
12085 if (!selected_tune)
12086 selected_tune = selected_cpu;
12087
12088 if (aarch64_enable_bti == 2)
12089 {
12090 #ifdef TARGET_ENABLE_BTI
12091 aarch64_enable_bti = 1;
12092 #else
12093 aarch64_enable_bti = 0;
12094 #endif
12095 }
12096
12097 /* Return address signing is currently not supported for ILP32 targets. For
12098 LP64 targets use the configured option in the absence of a command-line
12099 option for -mbranch-protection. */
12100 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12101 {
12102 #ifdef TARGET_ENABLE_PAC_RET
12103 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12104 #else
12105 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12106 #endif
12107 }
12108
12109 #ifndef HAVE_AS_MABI_OPTION
12110 /* The compiler may have been configured with 2.23.* binutils, which does
12111 not have support for ILP32. */
12112 if (TARGET_ILP32)
12113 error ("assembler does not support %<-mabi=ilp32%>");
12114 #endif
12115
12116 /* Convert -msve-vector-bits to a VG count. */
12117 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12118
12119 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12120 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12121
12122 /* Make sure we properly set up the explicit options. */
12123 if ((aarch64_cpu_string && valid_cpu)
12124 || (aarch64_tune_string && valid_tune))
12125 gcc_assert (explicit_tune_core != aarch64_none);
12126
12127 if ((aarch64_cpu_string && valid_cpu)
12128 || (aarch64_arch_string && valid_arch))
12129 gcc_assert (explicit_arch != aarch64_no_arch);
12130
12131 /* The pass to insert speculation tracking runs before
12132 shrink-wrapping and the latter does not know how to update the
12133 tracking status. So disable it in this case. */
12134 if (aarch64_track_speculation)
12135 flag_shrink_wrap = 0;
12136
12137 aarch64_override_options_internal (&global_options);
12138
12139 /* Save these options as the default ones in case we push and pop them later
12140 while processing functions with potential target attributes. */
12141 target_option_default_node = target_option_current_node
12142 = build_target_option_node (&global_options);
12143 }
12144
12145 /* Implement targetm.override_options_after_change. */
12146
12147 static void
12148 aarch64_override_options_after_change (void)
12149 {
12150 aarch64_override_options_after_change_1 (&global_options);
12151 }
12152
12153 static struct machine_function *
12154 aarch64_init_machine_status (void)
12155 {
12156 struct machine_function *machine;
12157 machine = ggc_cleared_alloc<machine_function> ();
12158 return machine;
12159 }
12160
12161 void
12162 aarch64_init_expanders (void)
12163 {
12164 init_machine_status = aarch64_init_machine_status;
12165 }
12166
12167 /* Resolve the code model selected in OPTS, taking -fpic/-fPIC into account, and set aarch64_cmodel. */
12168 static void
12169 initialize_aarch64_code_model (struct gcc_options *opts)
12170 {
12171 if (opts->x_flag_pic)
12172 {
12173 switch (opts->x_aarch64_cmodel_var)
12174 {
12175 case AARCH64_CMODEL_TINY:
12176 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12177 break;
12178 case AARCH64_CMODEL_SMALL:
12179 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12180 aarch64_cmodel = (flag_pic == 2
12181 ? AARCH64_CMODEL_SMALL_PIC
12182 : AARCH64_CMODEL_SMALL_SPIC);
12183 #else
12184 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12185 #endif
12186 break;
12187 case AARCH64_CMODEL_LARGE:
12188 sorry ("code model %qs with %<-f%s%>", "large",
12189 opts->x_flag_pic > 1 ? "PIC" : "pic");
12190 break;
12191 default:
12192 gcc_unreachable ();
12193 }
12194 }
12195 else
12196 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12197 }
12198
12199 /* Implement TARGET_OPTION_SAVE. */
12200
12201 static void
12202 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12203 {
12204 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12205 ptr->x_aarch64_branch_protection_string
12206 = opts->x_aarch64_branch_protection_string;
12207 }
12208
12209 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12210 using the information saved in PTR. */
12211
12212 static void
12213 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12214 {
12215 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12216 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12217 opts->x_explicit_arch = ptr->x_explicit_arch;
12218 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12219 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12220 opts->x_aarch64_branch_protection_string
12221 = ptr->x_aarch64_branch_protection_string;
12222 if (opts->x_aarch64_branch_protection_string)
12223 {
12224 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12225 NULL);
12226 }
12227
12228 aarch64_override_options_internal (opts);
12229 }
12230
12231 /* Implement TARGET_OPTION_PRINT. */
12232
12233 static void
12234 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12235 {
12236 const struct processor *cpu
12237 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12238 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12239 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12240 std::string extension
12241 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12242
12243 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12244 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12245 arch->name, extension.c_str ());
12246 }
12247
12248 static GTY(()) tree aarch64_previous_fndecl;
12249
12250 void
12251 aarch64_reset_previous_fndecl (void)
12252 {
12253 aarch64_previous_fndecl = NULL;
12254 }
12255
12256 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12257 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12258 make sure optab availability predicates are recomputed when necessary. */
12259
12260 void
12261 aarch64_save_restore_target_globals (tree new_tree)
12262 {
12263 if (TREE_TARGET_GLOBALS (new_tree))
12264 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12265 else if (new_tree == target_option_default_node)
12266 restore_target_globals (&default_target_globals);
12267 else
12268 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12269 }
12270
12271 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12272 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12273 of the function, if such exists. This function may be called multiple
12274 times on a single function so use aarch64_previous_fndecl to avoid
12275 setting up identical state. */
12276
12277 static void
12278 aarch64_set_current_function (tree fndecl)
12279 {
12280 if (!fndecl || fndecl == aarch64_previous_fndecl)
12281 return;
12282
12283 tree old_tree = (aarch64_previous_fndecl
12284 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12285 : NULL_TREE);
12286
12287 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12288
12289 /* If current function has no attributes but the previous one did,
12290 use the default node. */
12291 if (!new_tree && old_tree)
12292 new_tree = target_option_default_node;
12293
12294 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12295 the default have been handled by aarch64_save_restore_target_globals from
12296 aarch64_pragma_target_parse. */
12297 if (old_tree == new_tree)
12298 return;
12299
12300 aarch64_previous_fndecl = fndecl;
12301
12302 /* First set the target options. */
12303 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12304
12305 aarch64_save_restore_target_globals (new_tree);
12306 }
12307
12308 /* Enum describing the various ways we can handle attributes.
12309 In many cases we can reuse the generic option handling machinery. */
12310
12311 enum aarch64_attr_opt_type
12312 {
12313 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12314 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12315 aarch64_attr_enum, /* Attribute sets an enum variable. */
12316 aarch64_attr_custom /* Attribute requires a custom handling function. */
12317 };
12318
12319 /* All the information needed to handle a target attribute.
12320 NAME is the name of the attribute.
12321 ATTR_TYPE specifies the type of behavior of the attribute as described
12322 in the definition of enum aarch64_attr_opt_type.
12323 ALLOW_NEG is true if the attribute supports a "no-" form.
12324 HANDLER is the function that takes the attribute string as an argument
12325 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12326 OPT_NUM is the enum specifying the option that the attribute modifies.
12327 This is needed for attributes that mirror the behavior of a command-line
12328 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12329 aarch64_attr_enum. */
12330
12331 struct aarch64_attribute_info
12332 {
12333 const char *name;
12334 enum aarch64_attr_opt_type attr_type;
12335 bool allow_neg;
12336 bool (*handler) (const char *);
12337 enum opt_code opt_num;
12338 };
12339
12340 /* Handle the ARCH_STR argument to the arch= target attribute. */
12341
12342 static bool
12343 aarch64_handle_attr_arch (const char *str)
12344 {
12345 const struct processor *tmp_arch = NULL;
12346 std::string invalid_extension;
12347 enum aarch64_parse_opt_result parse_res
12348 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12349
12350 if (parse_res == AARCH64_PARSE_OK)
12351 {
12352 gcc_assert (tmp_arch);
12353 selected_arch = tmp_arch;
12354 explicit_arch = selected_arch->arch;
12355 return true;
12356 }
12357
12358 switch (parse_res)
12359 {
12360 case AARCH64_PARSE_MISSING_ARG:
12361 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12362 break;
12363 case AARCH64_PARSE_INVALID_ARG:
12364 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12365 aarch64_print_hint_for_arch (str);
12366 break;
12367 case AARCH64_PARSE_INVALID_FEATURE:
12368 error ("invalid feature modifier %s of value (\"%s\") in "
12369 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12370 aarch64_print_hint_for_extensions (invalid_extension);
12371 break;
12372 default:
12373 gcc_unreachable ();
12374 }
12375
12376 return false;
12377 }
12378
12379 /* Handle the argument CPU_STR to the cpu= target attribute. */
12380
12381 static bool
12382 aarch64_handle_attr_cpu (const char *str)
12383 {
12384 const struct processor *tmp_cpu = NULL;
12385 std::string invalid_extension;
12386 enum aarch64_parse_opt_result parse_res
12387 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12388
12389 if (parse_res == AARCH64_PARSE_OK)
12390 {
12391 gcc_assert (tmp_cpu);
12392 selected_tune = tmp_cpu;
12393 explicit_tune_core = selected_tune->ident;
12394
12395 selected_arch = &all_architectures[tmp_cpu->arch];
12396 explicit_arch = selected_arch->arch;
12397 return true;
12398 }
12399
12400 switch (parse_res)
12401 {
12402 case AARCH64_PARSE_MISSING_ARG:
12403 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12404 break;
12405 case AARCH64_PARSE_INVALID_ARG:
12406 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12407 aarch64_print_hint_for_core (str);
12408 break;
12409 case AARCH64_PARSE_INVALID_FEATURE:
12410 error ("invalid feature modifier %s of value (\"%s\") in "
12411 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12412 aarch64_print_hint_for_extensions (invalid_extension);
12413 break;
12414 default:
12415 gcc_unreachable ();
12416 }
12417
12418 return false;
12419 }
12420
12421 /* Handle the argument STR to the branch-protection= attribute. */
12422
12423 static bool
12424 aarch64_handle_attr_branch_protection (const char* str)
12425 {
12426 char *err_str = (char *) xmalloc (strlen (str) + 1);
12427 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12428 &err_str);
12429 bool success = false;
12430 switch (res)
12431 {
12432 case AARCH64_PARSE_MISSING_ARG:
12433 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12434 " attribute");
12435 break;
12436 case AARCH64_PARSE_INVALID_ARG:
12437 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12438 "=\")%> pragma or attribute", err_str);
12439 break;
12440 case AARCH64_PARSE_OK:
12441 success = true;
12442 /* Fall through. */
12443 case AARCH64_PARSE_INVALID_FEATURE:
12444 break;
12445 default:
12446 gcc_unreachable ();
12447 }
12448 free (err_str);
12449 return success;
12450 }
12451
12452 /* Handle the argument STR to the tune= target attribute. */
12453
12454 static bool
12455 aarch64_handle_attr_tune (const char *str)
12456 {
12457 const struct processor *tmp_tune = NULL;
12458 enum aarch64_parse_opt_result parse_res
12459 = aarch64_parse_tune (str, &tmp_tune);
12460
12461 if (parse_res == AARCH64_PARSE_OK)
12462 {
12463 gcc_assert (tmp_tune);
12464 selected_tune = tmp_tune;
12465 explicit_tune_core = selected_tune->ident;
12466 return true;
12467 }
12468
12469 switch (parse_res)
12470 {
12471 case AARCH64_PARSE_INVALID_ARG:
12472 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12473 aarch64_print_hint_for_core (str);
12474 break;
12475 default:
12476 gcc_unreachable ();
12477 }
12478
12479 return false;
12480 }
12481
12482 /* Parse an architecture extensions target attribute string specified in STR.
12483 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12484 if successful. Update aarch64_isa_flags to reflect the ISA features
12485 modified. */
12486
12487 static bool
12488 aarch64_handle_attr_isa_flags (char *str)
12489 {
12490 enum aarch64_parse_opt_result parse_res;
12491 uint64_t isa_flags = aarch64_isa_flags;
12492
12493 /* We allow "+nothing" in the beginning to clear out all architectural
12494 features if the user wants to handpick specific features. */
12495 if (strncmp ("+nothing", str, 8) == 0)
12496 {
12497 isa_flags = 0;
12498 str += 8;
12499 }
12500
12501 std::string invalid_extension;
12502 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12503
12504 if (parse_res == AARCH64_PARSE_OK)
12505 {
12506 aarch64_isa_flags = isa_flags;
12507 return true;
12508 }
12509
12510 switch (parse_res)
12511 {
12512 case AARCH64_PARSE_MISSING_ARG:
12513 error ("missing value in %<target()%> pragma or attribute");
12514 break;
12515
12516 case AARCH64_PARSE_INVALID_FEATURE:
12517 error ("invalid feature modifier %s of value (\"%s\") in "
12518 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12519 break;
12520
12521 default:
12522 gcc_unreachable ();
12523 }
12524
12525 return false;
12526 }
12527
12528 /* The target attributes that we support. On top of these we also support just
12529 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12530 handled explicitly in aarch64_process_one_target_attr. */
12531
12532 static const struct aarch64_attribute_info aarch64_attributes[] =
12533 {
12534 { "general-regs-only", aarch64_attr_mask, false, NULL,
12535 OPT_mgeneral_regs_only },
12536 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12537 OPT_mfix_cortex_a53_835769 },
12538 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12539 OPT_mfix_cortex_a53_843419 },
12540 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12541 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12542 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12543 OPT_momit_leaf_frame_pointer },
12544 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12545 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12546 OPT_march_ },
12547 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12548 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12549 OPT_mtune_ },
12550 { "branch-protection", aarch64_attr_custom, false,
12551 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12552 { "sign-return-address", aarch64_attr_enum, false, NULL,
12553 OPT_msign_return_address_ },
12554 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12555 };
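/* For illustration (the example function name is arbitrary), several of the
   entries above can be combined in a single attribute, separated by commas:
     __attribute__ ((target ("arch=armv8.2-a+crc,strict-align,no-omit-leaf-frame-pointer")))
     void f (void);
   Each comma-separated token is handled individually by
   aarch64_process_one_target_attr below.  */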
12556
12557 /* Parse ARG_STR which contains the definition of one target attribute.
12558 Show appropriate errors if any or return true if the attribute is valid. */
12559
12560 static bool
12561 aarch64_process_one_target_attr (char *arg_str)
12562 {
12563 bool invert = false;
12564
12565 size_t len = strlen (arg_str);
12566
12567 if (len == 0)
12568 {
12569 error ("malformed %<target()%> pragma or attribute");
12570 return false;
12571 }
12572
12573 char *str_to_check = (char *) alloca (len + 1);
12574 strcpy (str_to_check, arg_str);
12575
12576 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12577 It is easier to detect and handle it explicitly here rather than going
12578 through the machinery for the rest of the target attributes in this
12579 function. */
12580 if (*str_to_check == '+')
12581 return aarch64_handle_attr_isa_flags (str_to_check);
12582
12583 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12584 {
12585 invert = true;
12586 str_to_check += 3;
12587 }
12588 char *arg = strchr (str_to_check, '=');
12589
12590 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12591 and point ARG to "foo". */
12592 if (arg)
12593 {
12594 *arg = '\0';
12595 arg++;
12596 }
12597 const struct aarch64_attribute_info *p_attr;
12598 bool found = false;
12599 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12600 {
12601 /* If the names don't match up, or the user has given an argument
12602 to an attribute that doesn't accept one, or didn't give an argument
12603 to an attribute that expects one, fail to match. */
12604 if (strcmp (str_to_check, p_attr->name) != 0)
12605 continue;
12606
12607 found = true;
12608 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12609 || p_attr->attr_type == aarch64_attr_enum;
12610
12611 if (attr_need_arg_p ^ (arg != NULL))
12612 {
12613 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12614 return false;
12615 }
12616
12617 /* If the name matches but the attribute does not allow "no-" versions
12618 then we can't match. */
12619 if (invert && !p_attr->allow_neg)
12620 {
12621 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12622 return false;
12623 }
12624
12625 switch (p_attr->attr_type)
12626 {
12627 /* Has a custom handler registered.
12628 For example, cpu=, arch=, tune=. */
12629 case aarch64_attr_custom:
12630 gcc_assert (p_attr->handler);
12631 if (!p_attr->handler (arg))
12632 return false;
12633 break;
12634
12635 /* Either set or unset a boolean option. */
12636 case aarch64_attr_bool:
12637 {
12638 struct cl_decoded_option decoded;
12639
12640 generate_option (p_attr->opt_num, NULL, !invert,
12641 CL_TARGET, &decoded);
12642 aarch64_handle_option (&global_options, &global_options_set,
12643 &decoded, input_location);
12644 break;
12645 }
12646 /* Set or unset a bit in the target_flags. aarch64_handle_option
12647 should know what mask to apply given the option number. */
12648 case aarch64_attr_mask:
12649 {
12650 struct cl_decoded_option decoded;
12651 /* We only need to specify the option number.
12652 aarch64_handle_option will know which mask to apply. */
12653 decoded.opt_index = p_attr->opt_num;
12654 decoded.value = !invert;
12655 aarch64_handle_option (&global_options, &global_options_set,
12656 &decoded, input_location);
12657 break;
12658 }
12659 /* Use the option setting machinery to set an option to an enum. */
12660 case aarch64_attr_enum:
12661 {
12662 gcc_assert (arg);
12663 bool valid;
12664 int value;
12665 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12666 &value, CL_TARGET);
12667 if (valid)
12668 {
12669 set_option (&global_options, NULL, p_attr->opt_num, value,
12670 NULL, DK_UNSPECIFIED, input_location,
12671 global_dc);
12672 }
12673 else
12674 {
12675 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12676 }
12677 break;
12678 }
12679 default:
12680 gcc_unreachable ();
12681 }
12682 }
12683
12684 /* If we reached here we either have found an attribute and validated
12685 it or didn't match any. If we matched an attribute but its arguments
12686 were malformed we will have returned false already. */
12687 return found;
12688 }
12689
12690 /* Count how many times the character C appears in
12691 NULL-terminated string STR. */
12692
12693 static unsigned int
12694 num_occurences_in_str (char c, char *str)
12695 {
12696 unsigned int res = 0;
12697 while (*str != '\0')
12698 {
12699 if (*str == c)
12700 res++;
12701
12702 str++;
12703 }
12704
12705 return res;
12706 }
12707
12708 /* Parse the tree in ARGS that contains the target attribute information
12709 and update the global target options space. */
12710
12711 bool
12712 aarch64_process_target_attr (tree args)
12713 {
12714 if (TREE_CODE (args) == TREE_LIST)
12715 {
12716 do
12717 {
12718 tree head = TREE_VALUE (args);
12719 if (head)
12720 {
12721 if (!aarch64_process_target_attr (head))
12722 return false;
12723 }
12724 args = TREE_CHAIN (args);
12725 } while (args);
12726
12727 return true;
12728 }
12729
12730 if (TREE_CODE (args) != STRING_CST)
12731 {
12732 error ("attribute %<target%> argument not a string");
12733 return false;
12734 }
12735
12736 size_t len = strlen (TREE_STRING_POINTER (args));
12737 char *str_to_check = (char *) alloca (len + 1);
12738 strcpy (str_to_check, TREE_STRING_POINTER (args));
12739
12740 if (len == 0)
12741 {
12742 error ("malformed %<target()%> pragma or attribute");
12743 return false;
12744 }
12745
12746 /* Used to catch empty strings between commas, i.e.
12747 attribute ((target ("attr1,,attr2"))). */
12748 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12749
12750 /* Handle multiple target attributes separated by ','. */
12751 char *token = strtok_r (str_to_check, ",", &str_to_check);
12752
12753 unsigned int num_attrs = 0;
12754 while (token)
12755 {
12756 num_attrs++;
12757 if (!aarch64_process_one_target_attr (token))
12758 {
12759 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12760 return false;
12761 }
12762
12763 token = strtok_r (NULL, ",", &str_to_check);
12764 }
12765
12766 if (num_attrs != num_commas + 1)
12767 {
12768 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12769 return false;
12770 }
12771
12772 return true;
12773 }
12774
12775 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12776 process attribute ((target ("..."))). */
12777
12778 static bool
12779 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12780 {
12781 struct cl_target_option cur_target;
12782 bool ret;
12783 tree old_optimize;
12784 tree new_target, new_optimize;
12785 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12786
12787 /* If what we're processing is the current pragma string then the
12788 target option node is already stored in target_option_current_node
12789 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12790 having to re-parse the string. This is especially useful to keep
12791 arm_neon.h compile times down since that header contains a lot
12792 of intrinsics enclosed in pragmas. */
12793 if (!existing_target && args == current_target_pragma)
12794 {
12795 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12796 return true;
12797 }
12798 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12799
12800 old_optimize = build_optimization_node (&global_options);
12801 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12802
12803 /* If the function changed the optimization levels as well as setting
12804 target options, start with the optimizations specified. */
12805 if (func_optimize && func_optimize != old_optimize)
12806 cl_optimization_restore (&global_options,
12807 TREE_OPTIMIZATION (func_optimize));
12808
12809 /* Save the current target options to restore at the end. */
12810 cl_target_option_save (&cur_target, &global_options);
12811
12812 /* If fndecl already has some target attributes applied to it, unpack
12813 them so that we add this attribute on top of them, rather than
12814 overwriting them. */
12815 if (existing_target)
12816 {
12817 struct cl_target_option *existing_options
12818 = TREE_TARGET_OPTION (existing_target);
12819
12820 if (existing_options)
12821 cl_target_option_restore (&global_options, existing_options);
12822 }
12823 else
12824 cl_target_option_restore (&global_options,
12825 TREE_TARGET_OPTION (target_option_current_node));
12826
12827 ret = aarch64_process_target_attr (args);
12828
12829 /* Set up any additional state. */
12830 if (ret)
12831 {
12832 aarch64_override_options_internal (&global_options);
12833 /* Initialize SIMD builtins if we haven't already.
12834 Set current_target_pragma to NULL for the duration so that
12835 the builtin initialization code doesn't try to tag the functions
12836 being built with the attributes specified by any current pragma, thus
12837 going into an infinite recursion. */
12838 if (TARGET_SIMD)
12839 {
12840 tree saved_current_target_pragma = current_target_pragma;
12841 current_target_pragma = NULL;
12842 aarch64_init_simd_builtins ();
12843 current_target_pragma = saved_current_target_pragma;
12844 }
12845 new_target = build_target_option_node (&global_options);
12846 }
12847 else
12848 new_target = NULL;
12849
12850 new_optimize = build_optimization_node (&global_options);
12851
12852 if (fndecl && ret)
12853 {
12854 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12855
12856 if (old_optimize != new_optimize)
12857 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12858 }
12859
12860 cl_target_option_restore (&global_options, &cur_target);
12861
12862 if (old_optimize != new_optimize)
12863 cl_optimization_restore (&global_options,
12864 TREE_OPTIMIZATION (old_optimize));
12865 return ret;
12866 }
12867
12868 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12869 tri-bool options (yes, no, don't care) and the default value is
12870 DEF, determine whether to reject inlining. */
12871
12872 static bool
12873 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12874 int dont_care, int def)
12875 {
12876 /* If the callee doesn't care, always allow inlining. */
12877 if (callee == dont_care)
12878 return true;
12879
12880 /* If the caller doesn't care, always allow inlining. */
12881 if (caller == dont_care)
12882 return true;
12883
12884 /* Otherwise, allow inlining if either the callee and caller values
12885 agree, or if the callee is using the default value. */
12886 return (callee == caller || callee == def);
12887 }
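/* For illustration, with DONT_CARE == 2 and DEF == 1 (the values the
   -momit-leaf-frame-pointer check below passes in):
     caller 0, callee 2 -> inlining allowed (callee does not care)
     caller 0, callee 1 -> inlining allowed (callee uses the default)
     caller 1, callee 0 -> inlining rejected (explicit mismatch).  */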
12888
12889 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12890 to inline CALLEE into CALLER based on target-specific info.
12891 Make sure that the caller and callee have compatible architectural
12892 features. Then go through the other possible target attributes
12893 and see if they can block inlining. Try not to reject always_inline
12894 callees unless they are incompatible architecturally. */
12895
12896 static bool
12897 aarch64_can_inline_p (tree caller, tree callee)
12898 {
12899 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12900 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12901
12902 struct cl_target_option *caller_opts
12903 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12904 : target_option_default_node);
12905
12906 struct cl_target_option *callee_opts
12907 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12908 : target_option_default_node);
12909
12910 /* Callee's ISA flags should be a subset of the caller's. */
12911 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12912 != callee_opts->x_aarch64_isa_flags)
12913 return false;
12914
12915 /* Allow non-strict aligned functions inlining into strict
12916 aligned ones. */
12917 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12918 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12919 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12920 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12921 return false;
12922
12923 bool always_inline = lookup_attribute ("always_inline",
12924 DECL_ATTRIBUTES (callee));
12925
12926 /* If the architectural features match up and the callee is always_inline
12927 then the other attributes don't matter. */
12928 if (always_inline)
12929 return true;
12930
12931 if (caller_opts->x_aarch64_cmodel_var
12932 != callee_opts->x_aarch64_cmodel_var)
12933 return false;
12934
12935 if (caller_opts->x_aarch64_tls_dialect
12936 != callee_opts->x_aarch64_tls_dialect)
12937 return false;
12938
12939 /* Honour explicit requests to workaround errata. */
12940 if (!aarch64_tribools_ok_for_inlining_p (
12941 caller_opts->x_aarch64_fix_a53_err835769,
12942 callee_opts->x_aarch64_fix_a53_err835769,
12943 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12944 return false;
12945
12946 if (!aarch64_tribools_ok_for_inlining_p (
12947 caller_opts->x_aarch64_fix_a53_err843419,
12948 callee_opts->x_aarch64_fix_a53_err843419,
12949 2, TARGET_FIX_ERR_A53_843419))
12950 return false;
12951
12952 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12953 caller and callee and they don't match up, reject inlining. */
12954 if (!aarch64_tribools_ok_for_inlining_p (
12955 caller_opts->x_flag_omit_leaf_frame_pointer,
12956 callee_opts->x_flag_omit_leaf_frame_pointer,
12957 2, 1))
12958 return false;
12959
12960 /* If the callee has specific tuning overrides, respect them. */
12961 if (callee_opts->x_aarch64_override_tune_string != NULL
12962 && caller_opts->x_aarch64_override_tune_string == NULL)
12963 return false;
12964
12965 /* If the user specified tuning override strings for the
12966 caller and callee and they don't match up, reject inlining.
12967 We just do a string compare here, we don't analyze the meaning
12968 of the string, as it would be too costly for little gain. */
12969 if (callee_opts->x_aarch64_override_tune_string
12970 && caller_opts->x_aarch64_override_tune_string
12971 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12972 caller_opts->x_aarch64_override_tune_string) != 0))
12973 return false;
12974
12975 return true;
12976 }
12977
12978 /* Return true if SYMBOL_REF X binds locally. */
12979
12980 static bool
12981 aarch64_symbol_binds_local_p (const_rtx x)
12982 {
12983 return (SYMBOL_REF_DECL (x)
12984 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12985 : SYMBOL_REF_LOCAL_P (x));
12986 }
12987
12988 /* Return true if SYMBOL_REF X is thread local. */
12989 static bool
12990 aarch64_tls_symbol_p (rtx x)
12991 {
12992 if (! TARGET_HAVE_TLS)
12993 return false;
12994
12995 if (GET_CODE (x) != SYMBOL_REF)
12996 return false;
12997
12998 return SYMBOL_REF_TLS_MODEL (x) != 0;
12999 }
13000
13001 /* Classify a TLS symbol into one of the TLS kinds. */
13002 enum aarch64_symbol_type
13003 aarch64_classify_tls_symbol (rtx x)
13004 {
13005 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13006
13007 switch (tls_kind)
13008 {
13009 case TLS_MODEL_GLOBAL_DYNAMIC:
13010 case TLS_MODEL_LOCAL_DYNAMIC:
13011 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13012
13013 case TLS_MODEL_INITIAL_EXEC:
13014 switch (aarch64_cmodel)
13015 {
13016 case AARCH64_CMODEL_TINY:
13017 case AARCH64_CMODEL_TINY_PIC:
13018 return SYMBOL_TINY_TLSIE;
13019 default:
13020 return SYMBOL_SMALL_TLSIE;
13021 }
13022
13023 case TLS_MODEL_LOCAL_EXEC:
13024 if (aarch64_tls_size == 12)
13025 return SYMBOL_TLSLE12;
13026 else if (aarch64_tls_size == 24)
13027 return SYMBOL_TLSLE24;
13028 else if (aarch64_tls_size == 32)
13029 return SYMBOL_TLSLE32;
13030 else if (aarch64_tls_size == 48)
13031 return SYMBOL_TLSLE48;
13032 else
13033 gcc_unreachable ();
13034
13035 case TLS_MODEL_EMULATED:
13036 case TLS_MODEL_NONE:
13037 return SYMBOL_FORCE_TO_MEM;
13038
13039 default:
13040 gcc_unreachable ();
13041 }
13042 }
13043
13044 /* Return the correct method for accessing X + OFFSET, where X is either
13045 a SYMBOL_REF or LABEL_REF. */
13046
13047 enum aarch64_symbol_type
13048 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13049 {
13050 if (GET_CODE (x) == LABEL_REF)
13051 {
13052 switch (aarch64_cmodel)
13053 {
13054 case AARCH64_CMODEL_LARGE:
13055 return SYMBOL_FORCE_TO_MEM;
13056
13057 case AARCH64_CMODEL_TINY_PIC:
13058 case AARCH64_CMODEL_TINY:
13059 return SYMBOL_TINY_ABSOLUTE;
13060
13061 case AARCH64_CMODEL_SMALL_SPIC:
13062 case AARCH64_CMODEL_SMALL_PIC:
13063 case AARCH64_CMODEL_SMALL:
13064 return SYMBOL_SMALL_ABSOLUTE;
13065
13066 default:
13067 gcc_unreachable ();
13068 }
13069 }
13070
13071 if (GET_CODE (x) == SYMBOL_REF)
13072 {
13073 if (aarch64_tls_symbol_p (x))
13074 return aarch64_classify_tls_symbol (x);
13075
13076 switch (aarch64_cmodel)
13077 {
13078 case AARCH64_CMODEL_TINY:
13079 /* When we retrieve symbol + offset address, we have to make sure
13080 the offset does not cause overflow of the final address. But
13081 we have no way of knowing the address of symbol at compile time
13082 so we can't accurately say if the distance between the PC and
13083 symbol + offset is outside the addressable range of +/-1M in the
13084 TINY code model. So we rely on images not being greater than
13085 1M, cap the offset at 1M, and require anything beyond that to be
13086 loaded through an alternative mechanism. Furthermore, if the
13087 symbol is a weak reference to something that isn't known to
13088 resolve to a symbol in this module, then force to memory. */
13089 if ((SYMBOL_REF_WEAK (x)
13090 && !aarch64_symbol_binds_local_p (x))
13091 || !IN_RANGE (offset, -1048575, 1048575))
13092 return SYMBOL_FORCE_TO_MEM;
13093 return SYMBOL_TINY_ABSOLUTE;
13094
13095 case AARCH64_CMODEL_SMALL:
13096 /* Same reasoning as the tiny code model, but the offset cap here is
13097 4G. */
13098 if ((SYMBOL_REF_WEAK (x)
13099 && !aarch64_symbol_binds_local_p (x))
13100 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13101 HOST_WIDE_INT_C (4294967264)))
13102 return SYMBOL_FORCE_TO_MEM;
13103 return SYMBOL_SMALL_ABSOLUTE;
13104
13105 case AARCH64_CMODEL_TINY_PIC:
13106 if (!aarch64_symbol_binds_local_p (x))
13107 return SYMBOL_TINY_GOT;
13108 return SYMBOL_TINY_ABSOLUTE;
13109
13110 case AARCH64_CMODEL_SMALL_SPIC:
13111 case AARCH64_CMODEL_SMALL_PIC:
13112 if (!aarch64_symbol_binds_local_p (x))
13113 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13114 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13115 return SYMBOL_SMALL_ABSOLUTE;
13116
13117 case AARCH64_CMODEL_LARGE:
13118 /* This is alright even in PIC code as the constant
13119 pool reference is always PC relative and within
13120 the same translation unit. */
13121 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13122 return SYMBOL_SMALL_ABSOLUTE;
13123 else
13124 return SYMBOL_FORCE_TO_MEM;
13125
13126 default:
13127 gcc_unreachable ();
13128 }
13129 }
13130
13131 /* By default push everything into the constant pool. */
13132 return SYMBOL_FORCE_TO_MEM;
13133 }
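/* For illustration, under -fPIC with the default small code model a
   SYMBOL_REF that does not bind locally is classified as
   SYMBOL_SMALL_GOT_4G and accessed through the GOT, whereas a locally
   bound symbol stays SYMBOL_SMALL_ABSOLUTE.  */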
13134
13135 bool
13136 aarch64_constant_address_p (rtx x)
13137 {
13138 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13139 }
13140
13141 bool
13142 aarch64_legitimate_pic_operand_p (rtx x)
13143 {
13144 if (GET_CODE (x) == SYMBOL_REF
13145 || (GET_CODE (x) == CONST
13146 && GET_CODE (XEXP (x, 0)) == PLUS
13147 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13148 return false;
13149
13150 return true;
13151 }
13152
13153 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13154 that should be rematerialized rather than spilled. */
13155
13156 static bool
13157 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13158 {
13159 /* Support CSE and rematerialization of common constants. */
13160 if (CONST_INT_P (x)
13161 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13162 || GET_CODE (x) == CONST_VECTOR)
13163 return true;
13164
13165 /* Do not allow vector struct mode constants for Advanced SIMD.
13166 We could support 0 and -1 easily, but they need support in
13167 aarch64-simd.md. */
13168 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13169 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13170 return false;
13171
13172 /* Only accept variable-length vector constants if they can be
13173 handled directly.
13174
13175 ??? It would be possible to handle rematerialization of other
13176 constants via secondary reloads. */
13177 if (vec_flags & VEC_ANY_SVE)
13178 return aarch64_simd_valid_immediate (x, NULL);
13179
13180 if (GET_CODE (x) == HIGH)
13181 x = XEXP (x, 0);
13182
13183 /* Accept polynomial constants that can be calculated by using the
13184 destination of a move as the sole temporary. Constants that
13185 require a second temporary cannot be rematerialized (they can't be
13186 forced to memory and also aren't legitimate constants). */
13187 poly_int64 offset;
13188 if (poly_int_rtx_p (x, &offset))
13189 return aarch64_offset_temporaries (false, offset) <= 1;
13190
13191 /* If an offset is being added to something else, we need to allow the
13192 base to be moved into the destination register, meaning that there
13193 are no free temporaries for the offset. */
13194 x = strip_offset (x, &offset);
13195 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13196 return false;
13197
13198 /* Do not allow const (plus (anchor_symbol, const_int)). */
13199 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13200 return false;
13201
13202 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13203 so spilling them is better than rematerialization. */
13204 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13205 return true;
13206
13207 /* Label references are always constant. */
13208 if (GET_CODE (x) == LABEL_REF)
13209 return true;
13210
13211 return false;
13212 }
13213
13214 rtx
13215 aarch64_load_tp (rtx target)
13216 {
13217 if (!target
13218 || GET_MODE (target) != Pmode
13219 || !register_operand (target, Pmode))
13220 target = gen_reg_rtx (Pmode);
13221
13222 /* Can return in any reg. */
13223 emit_insn (gen_aarch64_load_tp_hard (target));
13224 return target;
13225 }
13226
13227 /* On AAPCS systems, this is the "struct __va_list". */
13228 static GTY(()) tree va_list_type;
13229
13230 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13231 Return the type to use as __builtin_va_list.
13232
13233 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13234
13235 struct __va_list
13236 {
13237 void *__stack;
13238 void *__gr_top;
13239 void *__vr_top;
13240 int __gr_offs;
13241 int __vr_offs;
13242 }; */
13243
13244 static tree
13245 aarch64_build_builtin_va_list (void)
13246 {
13247 tree va_list_name;
13248 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13249
13250 /* Create the type. */
13251 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13252 /* Give it the required name. */
13253 va_list_name = build_decl (BUILTINS_LOCATION,
13254 TYPE_DECL,
13255 get_identifier ("__va_list"),
13256 va_list_type);
13257 DECL_ARTIFICIAL (va_list_name) = 1;
13258 TYPE_NAME (va_list_type) = va_list_name;
13259 TYPE_STUB_DECL (va_list_type) = va_list_name;
13260
13261 /* Create the fields. */
13262 f_stack = build_decl (BUILTINS_LOCATION,
13263 FIELD_DECL, get_identifier ("__stack"),
13264 ptr_type_node);
13265 f_grtop = build_decl (BUILTINS_LOCATION,
13266 FIELD_DECL, get_identifier ("__gr_top"),
13267 ptr_type_node);
13268 f_vrtop = build_decl (BUILTINS_LOCATION,
13269 FIELD_DECL, get_identifier ("__vr_top"),
13270 ptr_type_node);
13271 f_groff = build_decl (BUILTINS_LOCATION,
13272 FIELD_DECL, get_identifier ("__gr_offs"),
13273 integer_type_node);
13274 f_vroff = build_decl (BUILTINS_LOCATION,
13275 FIELD_DECL, get_identifier ("__vr_offs"),
13276 integer_type_node);
13277
13278 /* Tell tree-stdarg pass about our internal offset fields.
13279 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13280 purposes, to identify whether the code is updating the va_list internal
13281 offset fields in an irregular way. */
13282 va_list_gpr_counter_field = f_groff;
13283 va_list_fpr_counter_field = f_vroff;
13284
13285 DECL_ARTIFICIAL (f_stack) = 1;
13286 DECL_ARTIFICIAL (f_grtop) = 1;
13287 DECL_ARTIFICIAL (f_vrtop) = 1;
13288 DECL_ARTIFICIAL (f_groff) = 1;
13289 DECL_ARTIFICIAL (f_vroff) = 1;
13290
13291 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13292 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13293 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13294 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13295 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13296
13297 TYPE_FIELDS (va_list_type) = f_stack;
13298 DECL_CHAIN (f_stack) = f_grtop;
13299 DECL_CHAIN (f_grtop) = f_vrtop;
13300 DECL_CHAIN (f_vrtop) = f_groff;
13301 DECL_CHAIN (f_groff) = f_vroff;
13302
13303 /* Compute its layout. */
13304 layout_type (va_list_type);
13305
13306 return va_list_type;
13307 }
13308
13309 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13310 static void
13311 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13312 {
13313 const CUMULATIVE_ARGS *cum;
13314 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13315 tree stack, grtop, vrtop, groff, vroff;
13316 tree t;
13317 int gr_save_area_size = cfun->va_list_gpr_size;
13318 int vr_save_area_size = cfun->va_list_fpr_size;
13319 int vr_offset;
13320
13321 cum = &crtl->args.info;
13322 if (cfun->va_list_gpr_size)
13323 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13324 cfun->va_list_gpr_size);
13325 if (cfun->va_list_fpr_size)
13326 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13327 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13328
13329 if (!TARGET_FLOAT)
13330 {
13331 gcc_assert (cum->aapcs_nvrn == 0);
13332 vr_save_area_size = 0;
13333 }
13334
13335 f_stack = TYPE_FIELDS (va_list_type_node);
13336 f_grtop = DECL_CHAIN (f_stack);
13337 f_vrtop = DECL_CHAIN (f_grtop);
13338 f_groff = DECL_CHAIN (f_vrtop);
13339 f_vroff = DECL_CHAIN (f_groff);
13340
13341 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13342 NULL_TREE);
13343 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13344 NULL_TREE);
13345 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13346 NULL_TREE);
13347 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13348 NULL_TREE);
13349 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13350 NULL_TREE);
13351
13352 /* Emit code to initialize STACK, which points to the next varargs stack
13353 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13354 by named arguments. STACK is 8-byte aligned. */
13355 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13356 if (cum->aapcs_stack_size > 0)
13357 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13358 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13359 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13360
13361 /* Emit code to initialize GRTOP, the top of the GR save area.
13362 virtual_incoming_args_rtx should have been 16 byte aligned. */
13363 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13364 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13365 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13366
13367 /* Emit code to initialize VRTOP, the top of the VR save area.
13368 This address is gr_save_area_bytes below GRTOP, rounded
13369 down to the next 16-byte boundary. */
13370 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13371 vr_offset = ROUND_UP (gr_save_area_size,
13372 STACK_BOUNDARY / BITS_PER_UNIT);
13373
13374 if (vr_offset)
13375 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13376 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13377 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13378
13379 /* Emit code to initialize GROFF, the offset from GRTOP of the
13380 next GPR argument. */
13381 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13382 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13383 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13384
13385 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13386 of the next VR argument. */
13387 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13388 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13389 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13390 }
13391
13392 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13393
13394 static tree
13395 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13396 gimple_seq *post_p ATTRIBUTE_UNUSED)
13397 {
13398 tree addr;
13399 bool indirect_p;
13400 bool is_ha; /* is HFA or HVA. */
13401 bool dw_align; /* double-word align. */
13402 machine_mode ag_mode = VOIDmode;
13403 int nregs;
13404 machine_mode mode;
13405
13406 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13407 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13408 HOST_WIDE_INT size, rsize, adjust, align;
13409 tree t, u, cond1, cond2;
13410
13411 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13412 if (indirect_p)
13413 type = build_pointer_type (type);
13414
13415 mode = TYPE_MODE (type);
13416
13417 f_stack = TYPE_FIELDS (va_list_type_node);
13418 f_grtop = DECL_CHAIN (f_stack);
13419 f_vrtop = DECL_CHAIN (f_grtop);
13420 f_groff = DECL_CHAIN (f_vrtop);
13421 f_vroff = DECL_CHAIN (f_groff);
13422
13423 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13424 f_stack, NULL_TREE);
13425 size = int_size_in_bytes (type);
13426
13427 bool abi_break;
13428 align
13429 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13430
13431 dw_align = false;
13432 adjust = 0;
13433 if (aarch64_vfp_is_call_or_return_candidate (mode,
13434 type,
13435 &ag_mode,
13436 &nregs,
13437 &is_ha))
13438 {
13439 /* No frontends can create types with variable-sized modes, so we
13440 shouldn't be asked to pass or return them. */
13441 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13442
13443 /* TYPE passed in fp/simd registers. */
13444 if (!TARGET_FLOAT)
13445 aarch64_err_no_fpadvsimd (mode);
13446
13447 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13448 unshare_expr (valist), f_vrtop, NULL_TREE);
13449 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13450 unshare_expr (valist), f_vroff, NULL_TREE);
13451
13452 rsize = nregs * UNITS_PER_VREG;
13453
13454 if (is_ha)
13455 {
13456 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13457 adjust = UNITS_PER_VREG - ag_size;
13458 }
13459 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13460 && size < UNITS_PER_VREG)
13461 {
13462 adjust = UNITS_PER_VREG - size;
13463 }
13464 }
13465 else
13466 {
13467 /* TYPE passed in general registers. */
13468 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13469 unshare_expr (valist), f_grtop, NULL_TREE);
13470 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13471 unshare_expr (valist), f_groff, NULL_TREE);
13472 rsize = ROUND_UP (size, UNITS_PER_WORD);
13473 nregs = rsize / UNITS_PER_WORD;
13474
13475 if (align > 8)
13476 {
13477 if (abi_break && warn_psabi)
13478 inform (input_location, "parameter passing for argument of type "
13479 "%qT changed in GCC 9.1", type);
13480 dw_align = true;
13481 }
13482
13483 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13484 && size < UNITS_PER_WORD)
13485 {
13486 adjust = UNITS_PER_WORD - size;
13487 }
13488 }
13489
13490 /* Get a local temporary for the field value. */
13491 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13492
13493 /* Emit code to branch if off >= 0. */
13494 t = build2 (GE_EXPR, boolean_type_node, off,
13495 build_int_cst (TREE_TYPE (off), 0));
13496 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13497
13498 if (dw_align)
13499 {
13500 /* Emit: offs = (offs + 15) & -16. */
13501 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13502 build_int_cst (TREE_TYPE (off), 15));
13503 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13504 build_int_cst (TREE_TYPE (off), -16));
13505 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13506 }
13507 else
13508 roundup = NULL;
13509
13510 /* Update ap.__[g|v]r_offs */
13511 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13512 build_int_cst (TREE_TYPE (off), rsize));
13513 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13514
13515 /* String up. */
13516 if (roundup)
13517 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13518
13519 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13520 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13521 build_int_cst (TREE_TYPE (f_off), 0));
13522 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13523
13524 /* String up: make sure the assignment happens before the use. */
13525 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13526 COND_EXPR_ELSE (cond1) = t;
13527
13528 /* Prepare the trees handling the argument that is passed on the stack;
13529 the top-level node will be stored in ON_STACK. */
13530 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13531 if (align > 8)
13532 {
13533 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13534 t = fold_build_pointer_plus_hwi (arg, 15);
13535 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13536 build_int_cst (TREE_TYPE (t), -16));
13537 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13538 }
13539 else
13540 roundup = NULL;
13541 /* Advance ap.__stack */
13542 t = fold_build_pointer_plus_hwi (arg, size + 7);
13543 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13544 build_int_cst (TREE_TYPE (t), -8));
13545 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13546 /* String up roundup and advance. */
13547 if (roundup)
13548 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13549 /* String up with arg */
13550 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13551 /* Big-endianness related address adjustment. */
13552 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13553 && size < UNITS_PER_WORD)
13554 {
13555 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13556 size_int (UNITS_PER_WORD - size));
13557 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13558 }
13559
13560 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13561 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13562
13563 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13564 t = off;
13565 if (adjust)
13566 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13567 build_int_cst (TREE_TYPE (off), adjust));
13568
13569 t = fold_convert (sizetype, t);
13570 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13571
13572 if (is_ha)
13573 {
13574 /* type ha; // treat as "struct {ftype field[n];}"
13575 ... [computing offs]
13576 for (i = 0; i <nregs; ++i, offs += 16)
13577 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13578 return ha; */
13579 int i;
13580 tree tmp_ha, field_t, field_ptr_t;
13581
13582 /* Declare a local variable. */
13583 tmp_ha = create_tmp_var_raw (type, "ha");
13584 gimple_add_tmp_var (tmp_ha);
13585
13586 /* Establish the base type. */
13587 switch (ag_mode)
13588 {
13589 case E_SFmode:
13590 field_t = float_type_node;
13591 field_ptr_t = float_ptr_type_node;
13592 break;
13593 case E_DFmode:
13594 field_t = double_type_node;
13595 field_ptr_t = double_ptr_type_node;
13596 break;
13597 case E_TFmode:
13598 field_t = long_double_type_node;
13599 field_ptr_t = long_double_ptr_type_node;
13600 break;
13601 case E_HFmode:
13602 field_t = aarch64_fp16_type_node;
13603 field_ptr_t = aarch64_fp16_ptr_type_node;
13604 break;
13605 case E_V2SImode:
13606 case E_V4SImode:
13607 {
13608 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13609 field_t = build_vector_type_for_mode (innertype, ag_mode);
13610 field_ptr_t = build_pointer_type (field_t);
13611 }
13612 break;
13613 default:
13614 gcc_assert (0);
13615 }
13616
13617 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
13618 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13619 addr = t;
13620 t = fold_convert (field_ptr_t, addr);
13621 t = build2 (MODIFY_EXPR, field_t,
13622 build1 (INDIRECT_REF, field_t, tmp_ha),
13623 build1 (INDIRECT_REF, field_t, t));
13624
13625 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13626 for (i = 1; i < nregs; ++i)
13627 {
13628 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13629 u = fold_convert (field_ptr_t, addr);
13630 u = build2 (MODIFY_EXPR, field_t,
13631 build2 (MEM_REF, field_t, tmp_ha,
13632 build_int_cst (field_ptr_t,
13633 (i *
13634 int_size_in_bytes (field_t)))),
13635 build1 (INDIRECT_REF, field_t, u));
13636 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13637 }
13638
13639 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13640 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13641 }
13642
13643 COND_EXPR_ELSE (cond2) = t;
13644 addr = fold_convert (build_pointer_type (type), cond1);
13645 addr = build_va_arg_indirect_ref (addr);
13646
13647 if (indirect_p)
13648 addr = build_va_arg_indirect_ref (addr);
13649
13650 return addr;
13651 }
13652
13653 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13654
13655 static void
13656 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13657 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13658 int no_rtl)
13659 {
13660 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13661 CUMULATIVE_ARGS local_cum;
13662 int gr_saved = cfun->va_list_gpr_size;
13663 int vr_saved = cfun->va_list_fpr_size;
13664
13665 /* The caller has advanced CUM up to, but not beyond, the last named
13666 argument. Advance a local copy of CUM past the last "real" named
13667 argument, to find out how many registers are left over. */
13668 local_cum = *cum;
13669 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13670
13671 /* Find out how many registers we need to save.
13672 Honor tree-stdarg analysis results. */
13673 if (cfun->va_list_gpr_size)
13674 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13675 cfun->va_list_gpr_size / UNITS_PER_WORD);
13676 if (cfun->va_list_fpr_size)
13677 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13678 cfun->va_list_fpr_size / UNITS_PER_VREG);
13679
13680 if (!TARGET_FLOAT)
13681 {
13682 gcc_assert (local_cum.aapcs_nvrn == 0);
13683 vr_saved = 0;
13684 }
13685
13686 if (!no_rtl)
13687 {
13688 if (gr_saved > 0)
13689 {
13690 rtx ptr, mem;
13691
13692 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13693 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13694 - gr_saved * UNITS_PER_WORD);
13695 mem = gen_frame_mem (BLKmode, ptr);
13696 set_mem_alias_set (mem, get_varargs_alias_set ());
13697
13698 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13699 mem, gr_saved);
13700 }
13701 if (vr_saved > 0)
13702 {
13703 /* We can't use move_block_from_reg, because it will use
13704 the wrong mode, storing D regs only. */
13705 machine_mode mode = TImode;
13706 int off, i, vr_start;
13707
13708 /* Set OFF to the offset from virtual_incoming_args_rtx of
13709 the first vector register. The VR save area lies below
13710 the GR one, and is aligned to 16 bytes. */
13711 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13712 STACK_BOUNDARY / BITS_PER_UNIT);
13713 off -= vr_saved * UNITS_PER_VREG;
13714
13715 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13716 for (i = 0; i < vr_saved; ++i)
13717 {
13718 rtx ptr, mem;
13719
13720 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13721 mem = gen_frame_mem (mode, ptr);
13722 set_mem_alias_set (mem, get_varargs_alias_set ());
13723 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13724 off += UNITS_PER_VREG;
13725 }
13726 }
13727 }
13728
13729 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13730 any complication of having crtl->args.pretend_args_size changed. */
13731 cfun->machine->frame.saved_varargs_size
13732 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13733 STACK_BOUNDARY / BITS_PER_UNIT)
13734 + vr_saved * UNITS_PER_VREG);
13735 }
13736
13737 static void
13738 aarch64_conditional_register_usage (void)
13739 {
13740 int i;
13741 if (!TARGET_FLOAT)
13742 {
13743 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13744 {
13745 fixed_regs[i] = 1;
13746 call_used_regs[i] = 1;
13747 }
13748 }
13749 if (!TARGET_SVE)
13750 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13751 {
13752 fixed_regs[i] = 1;
13753 call_used_regs[i] = 1;
13754 }
13755
13756 /* When tracking speculation, we need a couple of call-clobbered registers
13757 to track the speculation state. It would be nice to just use
13758 IP0 and IP1, but currently there are numerous places that just
13759 assume these registers are free for other uses (eg pointer
13760 authentication). */
13761 if (aarch64_track_speculation)
13762 {
13763 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13764 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13765 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13766 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13767 }
13768 }
13769
13770 /* Walk down the type tree of TYPE counting consecutive base elements.
13771 If *MODEP is VOIDmode, then set it to the first valid floating point
13772 type. If a non-floating point type is found, or if a floating point
13773 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13774 otherwise return the count in the sub-tree. */
13775 static int
13776 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13777 {
13778 machine_mode mode;
13779 HOST_WIDE_INT size;
13780
13781 switch (TREE_CODE (type))
13782 {
13783 case REAL_TYPE:
13784 mode = TYPE_MODE (type);
13785 if (mode != DFmode && mode != SFmode
13786 && mode != TFmode && mode != HFmode)
13787 return -1;
13788
13789 if (*modep == VOIDmode)
13790 *modep = mode;
13791
13792 if (*modep == mode)
13793 return 1;
13794
13795 break;
13796
13797 case COMPLEX_TYPE:
13798 mode = TYPE_MODE (TREE_TYPE (type));
13799 if (mode != DFmode && mode != SFmode
13800 && mode != TFmode && mode != HFmode)
13801 return -1;
13802
13803 if (*modep == VOIDmode)
13804 *modep = mode;
13805
13806 if (*modep == mode)
13807 return 2;
13808
13809 break;
13810
13811 case VECTOR_TYPE:
13812 /* Use V2SImode and V4SImode as representatives of all 64-bit
13813 and 128-bit vector types. */
13814 size = int_size_in_bytes (type);
13815 switch (size)
13816 {
13817 case 8:
13818 mode = V2SImode;
13819 break;
13820 case 16:
13821 mode = V4SImode;
13822 break;
13823 default:
13824 return -1;
13825 }
13826
13827 if (*modep == VOIDmode)
13828 *modep = mode;
13829
13830 /* Vector modes are considered to be opaque: two vectors are
13831 equivalent for the purposes of being homogeneous aggregates
13832 if they are the same size. */
13833 if (*modep == mode)
13834 return 1;
13835
13836 break;
13837
13838 case ARRAY_TYPE:
13839 {
13840 int count;
13841 tree index = TYPE_DOMAIN (type);
13842
13843 /* Can't handle incomplete types nor sizes that are not
13844 fixed. */
13845 if (!COMPLETE_TYPE_P (type)
13846 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13847 return -1;
13848
13849 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13850 if (count == -1
13851 || !index
13852 || !TYPE_MAX_VALUE (index)
13853 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13854 || !TYPE_MIN_VALUE (index)
13855 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13856 || count < 0)
13857 return -1;
13858
13859 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13860 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13861
13862 /* There must be no padding. */
13863 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13864 count * GET_MODE_BITSIZE (*modep)))
13865 return -1;
13866
13867 return count;
13868 }
13869
13870 case RECORD_TYPE:
13871 {
13872 int count = 0;
13873 int sub_count;
13874 tree field;
13875
13876 /* Can't handle incomplete types nor sizes that are not
13877 fixed. */
13878 if (!COMPLETE_TYPE_P (type)
13879 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13880 return -1;
13881
13882 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13883 {
13884 if (TREE_CODE (field) != FIELD_DECL)
13885 continue;
13886
13887 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13888 if (sub_count < 0)
13889 return -1;
13890 count += sub_count;
13891 }
13892
13893 /* There must be no padding. */
13894 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13895 count * GET_MODE_BITSIZE (*modep)))
13896 return -1;
13897
13898 return count;
13899 }
13900
13901 case UNION_TYPE:
13902 case QUAL_UNION_TYPE:
13903 {
13904 /* These aren't very interesting except in a degenerate case. */
13905 int count = 0;
13906 int sub_count;
13907 tree field;
13908
13909 /* Can't handle incomplete types nor sizes that are not
13910 fixed. */
13911 if (!COMPLETE_TYPE_P (type)
13912 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13913 return -1;
13914
13915 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13916 {
13917 if (TREE_CODE (field) != FIELD_DECL)
13918 continue;
13919
13920 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13921 if (sub_count < 0)
13922 return -1;
13923 count = count > sub_count ? count : sub_count;
13924 }
13925
13926 /* There must be no padding. */
13927 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13928 count * GET_MODE_BITSIZE (*modep)))
13929 return -1;
13930
13931 return count;
13932 }
13933
13934 default:
13935 break;
13936 }
13937
13938 return -1;
13939 }
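/* For illustration, for the type "double[3]" aapcs_vfp_sub_candidate
   returns 3 with *MODEP set to DFmode, while for
   "struct { float f; double d; }" the DFmode field does not match the
   SFmode already recorded in *MODEP, so the result is -1.  */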
13940
13941 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13942 type as described in AAPCS64 \S 4.1.2.
13943
13944 See the comment above aarch64_composite_type_p for the notes on MODE. */
13945
13946 static bool
13947 aarch64_short_vector_p (const_tree type,
13948 machine_mode mode)
13949 {
13950 poly_int64 size = -1;
13951
13952 if (type && TREE_CODE (type) == VECTOR_TYPE)
13953 size = int_size_in_bytes (type);
13954 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13955 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13956 size = GET_MODE_SIZE (mode);
13957
13958 return known_eq (size, 8) || known_eq (size, 16);
13959 }
13960
13961 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13962 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13963 array types. The C99 floating-point complex types are also considered
13964 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13965 types, which are GCC extensions and out of the scope of AAPCS64, are
13966 treated as composite types here as well.
13967
13968 Note that MODE itself is not sufficient in determining whether a type
13969 is such a composite type or not. This is because
13970 stor-layout.c:compute_record_mode may have already changed the MODE
13971 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13972 structure with only one field may have its MODE set to the mode of the
13973 field. Also an integer mode whose size matches the size of the
13974 RECORD_TYPE type may be used to substitute the original mode
13975 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13976 solely relied on. */
13977
13978 static bool
13979 aarch64_composite_type_p (const_tree type,
13980 machine_mode mode)
13981 {
13982 if (aarch64_short_vector_p (type, mode))
13983 return false;
13984
13985 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13986 return true;
13987
13988 if (mode == BLKmode
13989 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13990 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13991 return true;
13992
13993 return false;
13994 }
13995
13996 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13997 shall be passed or returned in simd/fp register(s) (providing these
13998 parameter passing registers are available).
13999
14000 Upon successful return, *COUNT returns the number of needed registers,
14001 *BASE_MODE returns the mode of the individual register and when IS_HA
14002 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14003 floating-point aggregate or a homogeneous short-vector aggregate. */
14004
14005 static bool
14006 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14007 const_tree type,
14008 machine_mode *base_mode,
14009 int *count,
14010 bool *is_ha)
14011 {
14012 machine_mode new_mode = VOIDmode;
14013 bool composite_p = aarch64_composite_type_p (type, mode);
14014
14015 if (is_ha != NULL) *is_ha = false;
14016
14017 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14018 || aarch64_short_vector_p (type, mode))
14019 {
14020 *count = 1;
14021 new_mode = mode;
14022 }
14023 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14024 {
14025 if (is_ha != NULL) *is_ha = true;
14026 *count = 2;
14027 new_mode = GET_MODE_INNER (mode);
14028 }
14029 else if (type && composite_p)
14030 {
14031 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14032
14033 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14034 {
14035 if (is_ha != NULL) *is_ha = true;
14036 *count = ag_count;
14037 }
14038 else
14039 return false;
14040 }
14041 else
14042 return false;
14043
14044 *base_mode = new_mode;
14045 return true;
14046 }
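/* For illustration, "struct { float x, y, z; }" is a homogeneous
   floating-point aggregate: the hook above returns true with *COUNT == 3,
   *BASE_MODE == SFmode and *IS_HA set.  A 16-byte short vector such as
   float32x4_t instead takes the first branch and returns *COUNT == 1 with
   *IS_HA left false.  */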
14047
14048 /* Implement TARGET_STRUCT_VALUE_RTX. */
14049
14050 static rtx
14051 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14052 int incoming ATTRIBUTE_UNUSED)
14053 {
14054 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14055 }
14056
14057 /* Implements target hook vector_mode_supported_p. */
14058 static bool
14059 aarch64_vector_mode_supported_p (machine_mode mode)
14060 {
14061 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14062 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14063 }
14064
14065 /* Return appropriate SIMD container
14066 for MODE within a vector of WIDTH bits. */
14067 static machine_mode
14068 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14069 {
14070 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14071 switch (mode)
14072 {
14073 case E_DFmode:
14074 return VNx2DFmode;
14075 case E_SFmode:
14076 return VNx4SFmode;
14077 case E_HFmode:
14078 return VNx8HFmode;
14079 case E_DImode:
14080 return VNx2DImode;
14081 case E_SImode:
14082 return VNx4SImode;
14083 case E_HImode:
14084 return VNx8HImode;
14085 case E_QImode:
14086 return VNx16QImode;
14087 default:
14088 return word_mode;
14089 }
14090
14091 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14092 if (TARGET_SIMD)
14093 {
14094 if (known_eq (width, 128))
14095 switch (mode)
14096 {
14097 case E_DFmode:
14098 return V2DFmode;
14099 case E_SFmode:
14100 return V4SFmode;
14101 case E_HFmode:
14102 return V8HFmode;
14103 case E_SImode:
14104 return V4SImode;
14105 case E_HImode:
14106 return V8HImode;
14107 case E_QImode:
14108 return V16QImode;
14109 case E_DImode:
14110 return V2DImode;
14111 default:
14112 break;
14113 }
14114 else
14115 switch (mode)
14116 {
14117 case E_SFmode:
14118 return V2SFmode;
14119 case E_HFmode:
14120 return V4HFmode;
14121 case E_SImode:
14122 return V2SImode;
14123 case E_HImode:
14124 return V4HImode;
14125 case E_QImode:
14126 return V8QImode;
14127 default:
14128 break;
14129 }
14130 }
14131 return word_mode;
14132 }
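/* For illustration, SImode with WIDTH == 128 maps to V4SImode and with
   WIDTH == 64 to V2SImode; when SVE is enabled and WIDTH equals
   BITS_PER_SVE_VECTOR it maps to VNx4SImode.  Unsupported combinations
   fall back to word_mode.  */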
14133
14134 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14135 static machine_mode
14136 aarch64_preferred_simd_mode (scalar_mode mode)
14137 {
14138 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14139 return aarch64_simd_container_mode (mode, bits);
14140 }
14141
14142 /* Return a list of possible vector sizes for the vectorizer
14143 to iterate over. */
14144 static void
14145 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14146 {
14147 if (TARGET_SVE)
14148 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14149 sizes->safe_push (16);
14150 sizes->safe_push (8);
14151 }
14152
14153 /* Implement TARGET_MANGLE_TYPE. */
14154
14155 static const char *
14156 aarch64_mangle_type (const_tree type)
14157 {
14158 /* The AArch64 ABI documents say that "__va_list" has to be
14159 mangled as if it is in the "std" namespace. */
14160 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14161 return "St9__va_list";
14162
14163 /* Half-precision float. */
14164 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14165 return "Dh";
14166
14167 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14168 builtin types. */
14169 if (TYPE_NAME (type) != NULL)
14170 return aarch64_mangle_builtin_type (type);
14171
14172 /* Use the default mangling. */
14173 return NULL;
14174 }
14175
14176 /* Find the first rtx_insn before insn that will generate an assembly
14177 instruction. */
14178
14179 static rtx_insn *
14180 aarch64_prev_real_insn (rtx_insn *insn)
14181 {
14182 if (!insn)
14183 return NULL;
14184
14185 do
14186 {
14187 insn = prev_real_insn (insn);
14188 }
14189 while (insn && recog_memoized (insn) < 0);
14190
14191 return insn;
14192 }
14193
14194 static bool
14195 is_madd_op (enum attr_type t1)
14196 {
14197 unsigned int i;
14198 /* A number of these may be AArch32 only. */
14199 enum attr_type mlatypes[] = {
14200 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14201 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14202 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14203 };
14204
14205 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14206 {
14207 if (t1 == mlatypes[i])
14208 return true;
14209 }
14210
14211 return false;
14212 }
14213
14214 /* Check if there is a register dependency between a load and the insn
14215 for which we hold recog_data. */
14216
14217 static bool
14218 dep_between_memop_and_curr (rtx memop)
14219 {
14220 rtx load_reg;
14221 int opno;
14222
14223 gcc_assert (GET_CODE (memop) == SET);
14224
14225 if (!REG_P (SET_DEST (memop)))
14226 return false;
14227
14228 load_reg = SET_DEST (memop);
14229 for (opno = 1; opno < recog_data.n_operands; opno++)
14230 {
14231 rtx operand = recog_data.operand[opno];
14232 if (REG_P (operand)
14233 && reg_overlap_mentioned_p (load_reg, operand))
14234 return true;
14235
14236 }
14237 return false;
14238 }
14239
14240
14241 /* When working around the Cortex-A53 erratum 835769,
14242 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14243 instruction and has a preceding memory instruction such that a NOP
14244 should be inserted between them. */
14245
14246 bool
14247 aarch64_madd_needs_nop (rtx_insn* insn)
14248 {
14249 enum attr_type attr_type;
14250 rtx_insn *prev;
14251 rtx body;
14252
14253 if (!TARGET_FIX_ERR_A53_835769)
14254 return false;
14255
14256 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14257 return false;
14258
14259 attr_type = get_attr_type (insn);
14260 if (!is_madd_op (attr_type))
14261 return false;
14262
14263 prev = aarch64_prev_real_insn (insn);
14264 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14265 Restore recog state to INSN to avoid state corruption. */
14266 extract_constrain_insn_cached (insn);
14267
14268 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14269 return false;
14270
14271 body = single_set (prev);
14272
14273 /* If the previous insn is a memory op and there is no dependency between
14274 it and the DImode madd, emit a NOP between them. If body is NULL then we
14275 have a complex memory operation, probably a load/store pair.
14276 Be conservative for now and emit a NOP. */
14277 if (GET_MODE (recog_data.operand[0]) == DImode
14278 && (!body || !dep_between_memop_and_curr (body)))
14279 return true;
14280
14281 return false;
14282
14283 }
14284
14285
14286 /* Implement FINAL_PRESCAN_INSN. */
14287
14288 void
14289 aarch64_final_prescan_insn (rtx_insn *insn)
14290 {
14291 if (aarch64_madd_needs_nop (insn))
14292 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14293 }
14294
14295
14296 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14297 instruction. */
14298
14299 bool
14300 aarch64_sve_index_immediate_p (rtx base_or_step)
14301 {
14302 return (CONST_INT_P (base_or_step)
14303 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14304 }
14305
14306 /* Return true if X is a valid immediate for the SVE ADD and SUB
14307 instructions. Negate X first if NEGATE_P is true. */
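/* For example, a vector duplicating 0x7f or 0x1200 (0x12 << 8) is accepted,
   whereas one duplicating 0x101 is not: the immediate must fit in an
   unsigned 8-bit field, optionally shifted left by 8 bits.  */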
14308
14309 bool
14310 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14311 {
14312 rtx elt;
14313
14314 if (!const_vec_duplicate_p (x, &elt)
14315 || !CONST_INT_P (elt))
14316 return false;
14317
14318 HOST_WIDE_INT val = INTVAL (elt);
14319 if (negate_p)
14320 val = -val;
14321 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14322
14323 if (val & 0xff)
14324 return IN_RANGE (val, 0, 0xff);
14325 return IN_RANGE (val, 0, 0xff00);
14326 }
14327
14328 /* Return true if X is a valid immediate operand for an SVE logical
14329 instruction such as AND. */
14330
14331 bool
14332 aarch64_sve_bitmask_immediate_p (rtx x)
14333 {
14334 rtx elt;
14335
14336 return (const_vec_duplicate_p (x, &elt)
14337 && CONST_INT_P (elt)
14338 && aarch64_bitmask_imm (INTVAL (elt),
14339 GET_MODE_INNER (GET_MODE (x))));
14340 }
14341
14342 /* Return true if X is a valid immediate for the SVE DUP and CPY
14343 instructions. */
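/* Illustrative values: -0x80 and 0x7f are accepted directly, 0x7f00 is
   accepted as 0x7f shifted left by 8, while 0x180 is rejected because its
   low byte is nonzero and the value does not fit in a signed 8-bit field.  */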
14344
14345 bool
14346 aarch64_sve_dup_immediate_p (rtx x)
14347 {
14348 rtx elt;
14349
14350 if (!const_vec_duplicate_p (x, &elt)
14351 || !CONST_INT_P (elt))
14352 return false;
14353
14354 HOST_WIDE_INT val = INTVAL (elt);
14355 if (val & 0xff)
14356 return IN_RANGE (val, -0x80, 0x7f);
14357 return IN_RANGE (val, -0x8000, 0x7f00);
14358 }
14359
14360 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14361 SIGNED_P says whether the operand is signed rather than unsigned. */
14362
14363 bool
14364 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14365 {
14366 rtx elt;
14367
14368 return (const_vec_duplicate_p (x, &elt)
14369 && CONST_INT_P (elt)
14370 && (signed_p
14371 ? IN_RANGE (INTVAL (elt), -16, 15)
14372 : IN_RANGE (INTVAL (elt), 0, 127)));
14373 }
14374
14375 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14376 instruction. Negate X first if NEGATE_P is true. */
14377
14378 bool
14379 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14380 {
14381 rtx elt;
14382 REAL_VALUE_TYPE r;
14383
14384 if (!const_vec_duplicate_p (x, &elt)
14385 || GET_CODE (elt) != CONST_DOUBLE)
14386 return false;
14387
14388 r = *CONST_DOUBLE_REAL_VALUE (elt);
14389
14390 if (negate_p)
14391 r = real_value_negate (&r);
14392
14393 if (real_equal (&r, &dconst1))
14394 return true;
14395 if (real_equal (&r, &dconsthalf))
14396 return true;
14397 return false;
14398 }
14399
14400 /* Return true if X is a valid immediate operand for an SVE FMUL
14401 instruction. */
14402
14403 bool
14404 aarch64_sve_float_mul_immediate_p (rtx x)
14405 {
14406 rtx elt;
14407
14408 /* GCC will never generate a multiply with an immediate of 2, so there is no
14409 point testing for it (even though it is a valid constant). */
14410 return (const_vec_duplicate_p (x, &elt)
14411 && GET_CODE (elt) == CONST_DOUBLE
14412 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14413 }
14414
14415 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14416 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14417 is nonnull, use it to describe valid immediates. */
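/* For example (purely illustrative), a replicated 32-bit value of
   0x00ab0000 can be encoded as 0xab with LSL #16, and 0x0000abff can be
   encoded as 0xab with MSL #8, where the low bits are filled with ones.  */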
14418 static bool
14419 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14420 simd_immediate_info *info,
14421 enum simd_immediate_check which,
14422 simd_immediate_info::insn_type insn)
14423 {
14424 /* Try a 4-byte immediate with LSL. */
14425 for (unsigned int shift = 0; shift < 32; shift += 8)
14426 if ((val32 & (0xff << shift)) == val32)
14427 {
14428 if (info)
14429 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14430 simd_immediate_info::LSL, shift);
14431 return true;
14432 }
14433
14434 /* Try a 2-byte immediate with LSL. */
14435 unsigned int imm16 = val32 & 0xffff;
14436 if (imm16 == (val32 >> 16))
14437 for (unsigned int shift = 0; shift < 16; shift += 8)
14438 if ((imm16 & (0xff << shift)) == imm16)
14439 {
14440 if (info)
14441 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14442 simd_immediate_info::LSL, shift);
14443 return true;
14444 }
14445
14446 /* Try a 4-byte immediate with MSL, except for cases that MVN
14447 can handle. */
14448 if (which == AARCH64_CHECK_MOV)
14449 for (unsigned int shift = 8; shift < 24; shift += 8)
14450 {
14451 unsigned int low = (1 << shift) - 1;
14452 if (((val32 & (0xff << shift)) | low) == val32)
14453 {
14454 if (info)
14455 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14456 simd_immediate_info::MSL, shift);
14457 return true;
14458 }
14459 }
14460
14461 return false;
14462 }
14463
14464 /* Return true if replicating VAL64 is a valid immediate for the
14465 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14466 use it to describe valid immediates. */
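/* For instance, 0xff0000ffffff0000 is not a replicated 32-bit value, but
   every byte is either 0x00 or 0xff, so it is accepted below as a 64-bit
   "bit-to-bytemask" immediate.  */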
14467 static bool
14468 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14469 simd_immediate_info *info,
14470 enum simd_immediate_check which)
14471 {
14472 unsigned int val32 = val64 & 0xffffffff;
14473 unsigned int val16 = val64 & 0xffff;
14474 unsigned int val8 = val64 & 0xff;
14475
14476 if (val32 == (val64 >> 32))
14477 {
14478 if ((which & AARCH64_CHECK_ORR) != 0
14479 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14480 simd_immediate_info::MOV))
14481 return true;
14482
14483 if ((which & AARCH64_CHECK_BIC) != 0
14484 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14485 simd_immediate_info::MVN))
14486 return true;
14487
14488 /* Try using a replicated byte. */
14489 if (which == AARCH64_CHECK_MOV
14490 && val16 == (val32 >> 16)
14491 && val8 == (val16 >> 8))
14492 {
14493 if (info)
14494 *info = simd_immediate_info (QImode, val8);
14495 return true;
14496 }
14497 }
14498
14499 /* Try using a bit-to-bytemask. */
14500 if (which == AARCH64_CHECK_MOV)
14501 {
14502 unsigned int i;
14503 for (i = 0; i < 64; i += 8)
14504 {
14505 unsigned char byte = (val64 >> i) & 0xff;
14506 if (byte != 0 && byte != 0xff)
14507 break;
14508 }
14509 if (i == 64)
14510 {
14511 if (info)
14512 *info = simd_immediate_info (DImode, val64);
14513 return true;
14514 }
14515 }
14516 return false;
14517 }
14518
14519 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14520 instruction. If INFO is nonnull, use it to describe valid immediates. */
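/* For example, 0x0101010101010101 is handled as a QImode DUP of #1,
   0x1200120012001200 as an HImode DUP of #0x12 with LSL #8, and
   0x00ff00ff00ff00ff only via a DUPM bitmask immediate.  */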
14521
14522 static bool
14523 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14524 simd_immediate_info *info)
14525 {
14526 scalar_int_mode mode = DImode;
14527 unsigned int val32 = val64 & 0xffffffff;
14528 if (val32 == (val64 >> 32))
14529 {
14530 mode = SImode;
14531 unsigned int val16 = val32 & 0xffff;
14532 if (val16 == (val32 >> 16))
14533 {
14534 mode = HImode;
14535 unsigned int val8 = val16 & 0xff;
14536 if (val8 == (val16 >> 8))
14537 mode = QImode;
14538 }
14539 }
14540 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14541 if (IN_RANGE (val, -0x80, 0x7f))
14542 {
14543 /* DUP with no shift. */
14544 if (info)
14545 *info = simd_immediate_info (mode, val);
14546 return true;
14547 }
14548 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14549 {
14550 /* DUP with LSL #8. */
14551 if (info)
14552 *info = simd_immediate_info (mode, val);
14553 return true;
14554 }
14555 if (aarch64_bitmask_imm (val64, mode))
14556 {
14557 /* DUPM. */
14558 if (info)
14559 *info = simd_immediate_info (mode, val);
14560 return true;
14561 }
14562 return false;
14563 }
14564
14565 /* Return true if OP is a valid SIMD immediate for the operation
14566 described by WHICH. If INFO is nonnull, use it to describe valid
14567 immediates. */
14568 bool
14569 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14570 enum simd_immediate_check which)
14571 {
14572 machine_mode mode = GET_MODE (op);
14573 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14574 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14575 return false;
14576
14577 scalar_mode elt_mode = GET_MODE_INNER (mode);
14578 rtx base, step;
14579 unsigned int n_elts;
14580 if (GET_CODE (op) == CONST_VECTOR
14581 && CONST_VECTOR_DUPLICATE_P (op))
14582 n_elts = CONST_VECTOR_NPATTERNS (op);
14583 else if ((vec_flags & VEC_SVE_DATA)
14584 && const_vec_series_p (op, &base, &step))
14585 {
14586 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14587 if (!aarch64_sve_index_immediate_p (base)
14588 || !aarch64_sve_index_immediate_p (step))
14589 return false;
14590
14591 if (info)
14592 *info = simd_immediate_info (elt_mode, base, step);
14593 return true;
14594 }
14595 else if (GET_CODE (op) == CONST_VECTOR
14596 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14597 /* N_ELTS set above. */;
14598 else
14599 return false;
14600
14601 /* Handle PFALSE and PTRUE. */
14602 if (vec_flags & VEC_SVE_PRED)
14603 return (op == CONST0_RTX (mode)
14604 || op == CONSTM1_RTX (mode));
14605
14606 scalar_float_mode elt_float_mode;
14607 if (n_elts == 1
14608 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14609 {
14610 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14611 if (aarch64_float_const_zero_rtx_p (elt)
14612 || aarch64_float_const_representable_p (elt))
14613 {
14614 if (info)
14615 *info = simd_immediate_info (elt_float_mode, elt);
14616 return true;
14617 }
14618 }
14619
14620 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14621 if (elt_size > 8)
14622 return false;
14623
14624 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14625
14626 /* Expand the vector constant out into a byte vector, with the least
14627 significant byte of the register first. */
14628 auto_vec<unsigned char, 16> bytes;
14629 bytes.reserve (n_elts * elt_size);
14630 for (unsigned int i = 0; i < n_elts; i++)
14631 {
14632 /* The vector is provided in gcc endian-neutral fashion.
14633 For aarch64_be Advanced SIMD, it must be laid out in the vector
14634 register in reverse order. */
14635 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14636 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14637
14638 if (elt_mode != elt_int_mode)
14639 elt = gen_lowpart (elt_int_mode, elt);
14640
14641 if (!CONST_INT_P (elt))
14642 return false;
14643
14644 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14645 for (unsigned int byte = 0; byte < elt_size; byte++)
14646 {
14647 bytes.quick_push (elt_val & 0xff);
14648 elt_val >>= BITS_PER_UNIT;
14649 }
14650 }
14651
14652 /* The immediate must repeat every eight bytes. */
14653 unsigned int nbytes = bytes.length ();
14654 for (unsigned i = 8; i < nbytes; ++i)
14655 if (bytes[i] != bytes[i - 8])
14656 return false;
14657
14658 /* Get the repeating 8-byte value as an integer. No endian correction
14659 is needed here because bytes is already in lsb-first order. */
14660 unsigned HOST_WIDE_INT val64 = 0;
14661 for (unsigned int i = 0; i < 8; i++)
14662 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14663 << (i * BITS_PER_UNIT));
14664
14665 if (vec_flags & VEC_SVE_DATA)
14666 return aarch64_sve_valid_immediate (val64, info);
14667 else
14668 return aarch64_advsimd_valid_immediate (val64, info, which);
14669 }
14670
14671 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14672 has a step in the range of INDEX. Return the index expression if so,
14673 otherwise return null. */
14674 rtx
14675 aarch64_check_zero_based_sve_index_immediate (rtx x)
14676 {
14677 rtx base, step;
14678 if (const_vec_series_p (x, &base, &step)
14679 && base == const0_rtx
14680 && aarch64_sve_index_immediate_p (step))
14681 return step;
14682 return NULL_RTX;
14683 }
14684
14685 /* Check whether immediate shift constants are within range. */
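/* For example, for a vector of 32-bit elements, immediate left shifts of
   0-31 and immediate right shifts of 1-32 are in range.  */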
14686 bool
14687 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14688 {
14689 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14690 if (left)
14691 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14692 else
14693 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14694 }
14695
14696 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14697 operation of width WIDTH at bit position POS. */
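/* Illustrative call:
     aarch64_mask_from_zextract_ops (GEN_INT (8), GEN_INT (16))
   returns a CONST_INT with the value 0xff0000.  */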
14698
14699 rtx
14700 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14701 {
14702 gcc_assert (CONST_INT_P (width));
14703 gcc_assert (CONST_INT_P (pos));
14704
14705 unsigned HOST_WIDE_INT mask
14706 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14707 return GEN_INT (mask << UINTVAL (pos));
14708 }
14709
14710 bool
14711 aarch64_mov_operand_p (rtx x, machine_mode mode)
14712 {
14713 if (GET_CODE (x) == HIGH
14714 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14715 return true;
14716
14717 if (CONST_INT_P (x))
14718 return true;
14719
14720 if (VECTOR_MODE_P (GET_MODE (x)))
14721 return aarch64_simd_valid_immediate (x, NULL);
14722
14723 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14724 return true;
14725
14726 if (aarch64_sve_cnt_immediate_p (x))
14727 return true;
14728
14729 return aarch64_classify_symbolic_expression (x)
14730 == SYMBOL_TINY_ABSOLUTE;
14731 }
14732
14733 /* Return a const_int vector of VAL. */
14734 rtx
14735 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14736 {
14737 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14738 return gen_const_vec_duplicate (mode, c);
14739 }
14740
14741 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14742
14743 bool
14744 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14745 {
14746 machine_mode vmode;
14747
14748 vmode = aarch64_simd_container_mode (mode, 64);
14749 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14750 return aarch64_simd_valid_immediate (op_v, NULL);
14751 }
14752
14753 /* Construct and return a PARALLEL RTX vector with elements numbering the
14754 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14755 the vector - from the perspective of the architecture. This does not
14756 line up with GCC's perspective on lane numbers, so we end up with
14757 different masks depending on our target endian-ness. The diagram
14758 below may help. We must draw the distinction when building masks
14759 which select one half of the vector. An instruction selecting
14760 architectural low-lanes for a big-endian target must be described using
14761 a mask selecting GCC high-lanes.
14762
14763 Big-Endian Little-Endian
14764
14765 GCC 0 1 2 3 3 2 1 0
14766 | x | x | x | x | | x | x | x | x |
14767 Architecture 3 2 1 0 3 2 1 0
14768
14769 Low Mask: { 2, 3 } { 0, 1 }
14770 High Mask: { 0, 1 } { 2, 3 }
14771
14772 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14773
14774 rtx
14775 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14776 {
14777 rtvec v = rtvec_alloc (nunits / 2);
14778 int high_base = nunits / 2;
14779 int low_base = 0;
14780 int base;
14781 rtx t1;
14782 int i;
14783
14784 if (BYTES_BIG_ENDIAN)
14785 base = high ? low_base : high_base;
14786 else
14787 base = high ? high_base : low_base;
14788
14789 for (i = 0; i < nunits / 2; i++)
14790 RTVEC_ELT (v, i) = GEN_INT (base + i);
14791
14792 t1 = gen_rtx_PARALLEL (mode, v);
14793 return t1;
14794 }
14795
14796 /* Check OP for validity as a PARALLEL RTX vector with elements
14797 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14798 from the perspective of the architecture. See the diagram above
14799 aarch64_simd_vect_par_cnst_half for more details. */
14800
14801 bool
14802 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14803 bool high)
14804 {
14805 int nelts;
14806 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14807 return false;
14808
14809 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14810 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14811 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14812 int i = 0;
14813
14814 if (count_op != count_ideal)
14815 return false;
14816
14817 for (i = 0; i < count_ideal; i++)
14818 {
14819 rtx elt_op = XVECEXP (op, 0, i);
14820 rtx elt_ideal = XVECEXP (ideal, 0, i);
14821
14822 if (!CONST_INT_P (elt_op)
14823 || INTVAL (elt_ideal) != INTVAL (elt_op))
14824 return false;
14825 }
14826 return true;
14827 }
14828
14829 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14830 HIGH (exclusive). */
14831 void
14832 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14833 const_tree exp)
14834 {
14835 HOST_WIDE_INT lane;
14836 gcc_assert (CONST_INT_P (operand));
14837 lane = INTVAL (operand);
14838
14839 if (lane < low || lane >= high)
14840 {
14841 if (exp)
14842 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14843 else
14844 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14845 }
14846 }
14847
14848 /* Perform endian correction on lane number N, which indexes a vector
14849 of mode MODE, and return the result as an SImode rtx. */
14850
14851 rtx
14852 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14853 {
14854 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14855 }
14856
14857 /* Return TRUE if OP is a valid vector addressing mode. */
14858
14859 bool
14860 aarch64_simd_mem_operand_p (rtx op)
14861 {
14862 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14863 || REG_P (XEXP (op, 0)));
14864 }
14865
14866 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
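/* The accepted addresses are a base register plus an unsigned immediate
   that is a multiple of the element size, in the range 0 to 63 elements
   (for example 0-252 bytes for 4-byte elements).  */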
14867
14868 bool
14869 aarch64_sve_ld1r_operand_p (rtx op)
14870 {
14871 struct aarch64_address_info addr;
14872 scalar_mode mode;
14873
14874 return (MEM_P (op)
14875 && is_a <scalar_mode> (GET_MODE (op), &mode)
14876 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14877 && addr.type == ADDRESS_REG_IMM
14878 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14879 }
14880
14881 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14882 The conditions for STR are the same. */
14883 bool
14884 aarch64_sve_ldr_operand_p (rtx op)
14885 {
14886 struct aarch64_address_info addr;
14887
14888 return (MEM_P (op)
14889 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14890 false, ADDR_QUERY_ANY)
14891 && addr.type == ADDRESS_REG_IMM);
14892 }
14893
14894 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14895 We need to be able to access the individual pieces, so the range
14896 is different from LD[234] and ST[234]. */
14897 bool
14898 aarch64_sve_struct_memory_operand_p (rtx op)
14899 {
14900 if (!MEM_P (op))
14901 return false;
14902
14903 machine_mode mode = GET_MODE (op);
14904 struct aarch64_address_info addr;
14905 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14906 ADDR_QUERY_ANY)
14907 || addr.type != ADDRESS_REG_IMM)
14908 return false;
14909
14910 poly_int64 first = addr.const_offset;
14911 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14912 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14913 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14914 }
14915
14916 /* Emit a register copy from operand to operand, taking care not to
14917 early-clobber source registers in the process.
14918
14919 COUNT is the number of components into which the copy needs to be
14920 decomposed. */
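/* For example, copying a two-register group from V1-V2 to V2-V3 must move
   V2 to V3 before moving V1 to V2, whereas copying from V2-V3 to V1-V2 can
   proceed in ascending order; the overlap and direction checks below pick
   the safe order.  */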
14921 void
14922 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14923 unsigned int count)
14924 {
14925 unsigned int i;
14926 int rdest = REGNO (operands[0]);
14927 int rsrc = REGNO (operands[1]);
14928
14929 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14930 || rdest < rsrc)
14931 for (i = 0; i < count; i++)
14932 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14933 gen_rtx_REG (mode, rsrc + i));
14934 else
14935 for (i = 0; i < count; i++)
14936 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14937 gen_rtx_REG (mode, rsrc + count - i - 1));
14938 }
14939
14940 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14941 one of VSTRUCT modes: OI, CI, or XI. */
14942 int
14943 aarch64_simd_attr_length_rglist (machine_mode mode)
14944 {
14945 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14946 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14947 }
14948
14949 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14950 alignment of a vector to 128 bits. SVE predicates have an alignment of
14951 16 bits. */
14952 static HOST_WIDE_INT
14953 aarch64_simd_vector_alignment (const_tree type)
14954 {
14955 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14956 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14957 be set for non-predicate vectors of booleans. Modes are the most
14958 direct way we have of identifying real SVE predicate types. */
14959 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14960 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
14961 }
14962
14963 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14964 static poly_uint64
14965 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14966 {
14967 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14968 {
14969 /* If the length of the vector is fixed, try to align to that length,
14970 otherwise don't try to align at all. */
14971 HOST_WIDE_INT result;
14972 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14973 result = TYPE_ALIGN (TREE_TYPE (type));
14974 return result;
14975 }
14976 return TYPE_ALIGN (type);
14977 }
14978
14979 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14980 static bool
14981 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14982 {
14983 if (is_packed)
14984 return false;
14985
14986 /* For fixed-length vectors, check that the vectorizer will aim for
14987 full-vector alignment. This isn't true for generic GCC vectors
14988 that are wider than the ABI maximum of 128 bits. */
14989 poly_uint64 preferred_alignment =
14990 aarch64_vectorize_preferred_vector_alignment (type);
14991 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14992 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14993 preferred_alignment))
14994 return false;
14995
14996 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14997 return true;
14998 }
14999
15000 /* Return true if the vector misalignment factor is supported by the
15001 target. */
15002 static bool
15003 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15004 const_tree type, int misalignment,
15005 bool is_packed)
15006 {
15007 if (TARGET_SIMD && STRICT_ALIGNMENT)
15008 {
15009 /* Return false if the movmisalign pattern is not supported for this mode. */
15010 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15011 return false;
15012
15013 /* Misalignment factor is unknown at compile time. */
15014 if (misalignment == -1)
15015 return false;
15016 }
15017 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15018 is_packed);
15019 }
15020
15021 /* If VALS is a vector constant that can be loaded into a register
15022 using DUP, generate instructions to do so and return an RTX to
15023 assign to the register. Otherwise return NULL_RTX. */
15024 static rtx
15025 aarch64_simd_dup_constant (rtx vals)
15026 {
15027 machine_mode mode = GET_MODE (vals);
15028 machine_mode inner_mode = GET_MODE_INNER (mode);
15029 rtx x;
15030
15031 if (!const_vec_duplicate_p (vals, &x))
15032 return NULL_RTX;
15033
15034 /* We can load this constant by using DUP and a constant in a
15035 single ARM register. This will be cheaper than a vector
15036 load. */
15037 x = copy_to_mode_reg (inner_mode, x);
15038 return gen_vec_duplicate (mode, x);
15039 }
15040
15041
15042 /* Generate code to load VALS, which is a PARALLEL containing only
15043 constants (for vec_init) or CONST_VECTOR, efficiently into a
15044 register. Returns an RTX to copy into the register, or NULL_RTX
15045 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15046 static rtx
15047 aarch64_simd_make_constant (rtx vals)
15048 {
15049 machine_mode mode = GET_MODE (vals);
15050 rtx const_dup;
15051 rtx const_vec = NULL_RTX;
15052 int n_const = 0;
15053 int i;
15054
15055 if (GET_CODE (vals) == CONST_VECTOR)
15056 const_vec = vals;
15057 else if (GET_CODE (vals) == PARALLEL)
15058 {
15059 /* A CONST_VECTOR must contain only CONST_INTs and
15060 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15061 Only store valid constants in a CONST_VECTOR. */
15062 int n_elts = XVECLEN (vals, 0);
15063 for (i = 0; i < n_elts; ++i)
15064 {
15065 rtx x = XVECEXP (vals, 0, i);
15066 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15067 n_const++;
15068 }
15069 if (n_const == n_elts)
15070 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15071 }
15072 else
15073 gcc_unreachable ();
15074
15075 if (const_vec != NULL_RTX
15076 && aarch64_simd_valid_immediate (const_vec, NULL))
15077 /* Load using MOVI/MVNI. */
15078 return const_vec;
15079 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15080 /* Loaded using DUP. */
15081 return const_dup;
15082 else if (const_vec != NULL_RTX)
15083 /* Load from constant pool. We cannot take advantage of single-cycle
15084 LD1 because we need a PC-relative addressing mode. */
15085 return const_vec;
15086 else
15087 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15088 We cannot construct an initializer. */
15089 return NULL_RTX;
15090 }
15091
15092 /* Expand a vector initialisation sequence, such that TARGET is
15093 initialised to contain VALS. */
15094
15095 void
15096 aarch64_expand_vector_init (rtx target, rtx vals)
15097 {
15098 machine_mode mode = GET_MODE (target);
15099 scalar_mode inner_mode = GET_MODE_INNER (mode);
15100 /* The number of vector elements. */
15101 int n_elts = XVECLEN (vals, 0);
15102 /* The number of vector elements which are not constant. */
15103 int n_var = 0;
15104 rtx any_const = NULL_RTX;
15105 /* The first element of vals. */
15106 rtx v0 = XVECEXP (vals, 0, 0);
15107 bool all_same = true;
15108
15109 /* This is a special vec_init<M><N> where N is not an element mode but a
15110 vector mode with half the elements of M. We expect to find two entries
15111 of mode N in VALS and we must put their concatenation into TARGET. */
15112 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15113 {
15114 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15115 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15116 rtx lo = XVECEXP (vals, 0, 0);
15117 rtx hi = XVECEXP (vals, 0, 1);
15118 machine_mode narrow_mode = GET_MODE (lo);
15119 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15120 gcc_assert (narrow_mode == GET_MODE (hi));
15121
15122 /* When we want to concatenate a half-width vector with zeroes we can
15123 use the aarch64_combinez[_be] patterns. Just make sure that the
15124 zeroes are in the right half. */
15125 if (BYTES_BIG_ENDIAN
15126 && aarch64_simd_imm_zero (lo, narrow_mode)
15127 && general_operand (hi, narrow_mode))
15128 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15129 else if (!BYTES_BIG_ENDIAN
15130 && aarch64_simd_imm_zero (hi, narrow_mode)
15131 && general_operand (lo, narrow_mode))
15132 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15133 else
15134 {
15135 /* Else create the two half-width registers and combine them. */
15136 if (!REG_P (lo))
15137 lo = force_reg (GET_MODE (lo), lo);
15138 if (!REG_P (hi))
15139 hi = force_reg (GET_MODE (hi), hi);
15140
15141 if (BYTES_BIG_ENDIAN)
15142 std::swap (lo, hi);
15143 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15144 }
15145 return;
15146 }
15147
15148 /* Count the number of variable elements to initialise. */
15149 for (int i = 0; i < n_elts; ++i)
15150 {
15151 rtx x = XVECEXP (vals, 0, i);
15152 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15153 ++n_var;
15154 else
15155 any_const = x;
15156
15157 all_same &= rtx_equal_p (x, v0);
15158 }
15159
15160 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15161 how best to handle this. */
15162 if (n_var == 0)
15163 {
15164 rtx constant = aarch64_simd_make_constant (vals);
15165 if (constant != NULL_RTX)
15166 {
15167 emit_move_insn (target, constant);
15168 return;
15169 }
15170 }
15171
15172 /* Splat a single non-constant element if we can. */
15173 if (all_same)
15174 {
15175 rtx x = copy_to_mode_reg (inner_mode, v0);
15176 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15177 return;
15178 }
15179
15180 enum insn_code icode = optab_handler (vec_set_optab, mode);
15181 gcc_assert (icode != CODE_FOR_nothing);
15182
15183 /* If there are only variable elements, try to optimize
15184 the insertion using dup for the most common element
15185 followed by insertions. */
15186
15187 /* The algorithm will fill matches[*][0] with the earliest matching element,
15188 and matches[X][1] with the count of duplicate elements (if X is the
15189 earliest element which has duplicates). */
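/* For example, for the all-variable vector {a, b, a, a} the code below
   DUPs a (the most frequent element) into TARGET and then inserts b into
   lane 1 via the vec_set pattern.  */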
15190
15191 if (n_var == n_elts && n_elts <= 16)
15192 {
15193 int matches[16][2] = {0};
15194 for (int i = 0; i < n_elts; i++)
15195 {
15196 for (int j = 0; j <= i; j++)
15197 {
15198 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15199 {
15200 matches[i][0] = j;
15201 matches[j][1]++;
15202 break;
15203 }
15204 }
15205 }
15206 int maxelement = 0;
15207 int maxv = 0;
15208 for (int i = 0; i < n_elts; i++)
15209 if (matches[i][1] > maxv)
15210 {
15211 maxelement = i;
15212 maxv = matches[i][1];
15213 }
15214
15215 /* Create a duplicate of the most common element, unless all elements
15216 are equally useless to us, in which case just immediately set the
15217 vector register using the first element. */
15218
15219 if (maxv == 1)
15220 {
15221 /* For vectors of two 64-bit elements, we can do even better. */
15222 if (n_elts == 2
15223 && (inner_mode == E_DImode
15224 || inner_mode == E_DFmode))
15225
15226 {
15227 rtx x0 = XVECEXP (vals, 0, 0);
15228 rtx x1 = XVECEXP (vals, 0, 1);
15229 /* Combine can pick up this case, but handling it directly
15230 here leaves clearer RTL.
15231
15232 This is load_pair_lanes<mode>, and also gives us a clean-up
15233 for store_pair_lanes<mode>. */
15234 if (memory_operand (x0, inner_mode)
15235 && memory_operand (x1, inner_mode)
15236 && !STRICT_ALIGNMENT
15237 && rtx_equal_p (XEXP (x1, 0),
15238 plus_constant (Pmode,
15239 XEXP (x0, 0),
15240 GET_MODE_SIZE (inner_mode))))
15241 {
15242 rtx t;
15243 if (inner_mode == DFmode)
15244 t = gen_load_pair_lanesdf (target, x0, x1);
15245 else
15246 t = gen_load_pair_lanesdi (target, x0, x1);
15247 emit_insn (t);
15248 return;
15249 }
15250 }
15251 /* The subreg-move sequence below will move into lane zero of the
15252 vector register. For big-endian we want that position to hold
15253 the last element of VALS. */
15254 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15255 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15256 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15257 }
15258 else
15259 {
15260 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15261 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15262 }
15263
15264 /* Insert the rest. */
15265 for (int i = 0; i < n_elts; i++)
15266 {
15267 rtx x = XVECEXP (vals, 0, i);
15268 if (matches[i][0] == maxelement)
15269 continue;
15270 x = copy_to_mode_reg (inner_mode, x);
15271 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15272 }
15273 return;
15274 }
15275
15276 /* Initialise a vector which is part-variable. We want to first try
15277 to build those lanes which are constant in the most efficient way we
15278 can. */
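/* Illustration: for {1, x, 3, y} the constant copy built below becomes
   {1, 1, 3, 1}; that is loaded into TARGET first, and the variable
   elements x and y are then inserted into lanes 1 and 3.  */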
15279 if (n_var != n_elts)
15280 {
15281 rtx copy = copy_rtx (vals);
15282
15283 /* Load constant part of vector. We really don't care what goes into the
15284 parts we will overwrite, but we're more likely to be able to load the
15285 constant efficiently if it has fewer, larger, repeating parts
15286 (see aarch64_simd_valid_immediate). */
15287 for (int i = 0; i < n_elts; i++)
15288 {
15289 rtx x = XVECEXP (vals, 0, i);
15290 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15291 continue;
15292 rtx subst = any_const;
15293 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15294 {
15295 /* Look in the copied vector, as more elements are const. */
15296 rtx test = XVECEXP (copy, 0, i ^ bit);
15297 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15298 {
15299 subst = test;
15300 break;
15301 }
15302 }
15303 XVECEXP (copy, 0, i) = subst;
15304 }
15305 aarch64_expand_vector_init (target, copy);
15306 }
15307
15308 /* Insert the variable lanes directly. */
15309 for (int i = 0; i < n_elts; i++)
15310 {
15311 rtx x = XVECEXP (vals, 0, i);
15312 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15313 continue;
15314 x = copy_to_mode_reg (inner_mode, x);
15315 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15316 }
15317 }
15318
15319 /* Emit RTL corresponding to:
15320 insr TARGET, ELEM. */
15321
15322 static void
15323 emit_insr (rtx target, rtx elem)
15324 {
15325 machine_mode mode = GET_MODE (target);
15326 scalar_mode elem_mode = GET_MODE_INNER (mode);
15327 elem = force_reg (elem_mode, elem);
15328
15329 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15330 gcc_assert (icode != CODE_FOR_nothing);
15331 emit_insn (GEN_FCN (icode) (target, target, elem));
15332 }
15333
15334 /* Subroutine of aarch64_sve_expand_vector_init for handling
15335 trailing constants.
15336 This function works as follows:
15337 (a) Create a new vector consisting of trailing constants.
15338 (b) Initialize TARGET with the constant vector using emit_move_insn.
15339 (c) Insert remaining elements in TARGET using insr.
15340 NELTS is the total number of elements in the original vector, while
15341 NELTS_REQD is the number of elements that are actually
15342 significant.
15343
15344 ??? The heuristic used is to do the above only if the number of constants
15345 is at least half the total number of elements. May need fine tuning. */
15346
15347 static bool
15348 aarch64_sve_expand_vector_init_handle_trailing_constants
15349 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15350 {
15351 machine_mode mode = GET_MODE (target);
15352 scalar_mode elem_mode = GET_MODE_INNER (mode);
15353 int n_trailing_constants = 0;
15354
15355 for (int i = nelts_reqd - 1;
15356 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15357 i--)
15358 n_trailing_constants++;
15359
15360 if (n_trailing_constants >= nelts_reqd / 2)
15361 {
15362 rtx_vector_builder v (mode, 1, nelts);
15363 for (int i = 0; i < nelts; i++)
15364 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15365 rtx const_vec = v.build ();
15366 emit_move_insn (target, const_vec);
15367
15368 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15369 emit_insr (target, builder.elt (i));
15370
15371 return true;
15372 }
15373
15374 return false;
15375 }
15376
15377 /* Subroutine of aarch64_sve_expand_vector_init.
15378 Works as follows:
15379 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15380 (b) Skip trailing elements from BUILDER, which are the same as
15381 element NELTS_REQD - 1.
15382 (c) Insert earlier elements in reverse order in TARGET using insr. */
15383
15384 static void
15385 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15386 const rtx_vector_builder &builder,
15387 int nelts_reqd)
15388 {
15389 machine_mode mode = GET_MODE (target);
15390 scalar_mode elem_mode = GET_MODE_INNER (mode);
15391
15392 struct expand_operand ops[2];
15393 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15394 gcc_assert (icode != CODE_FOR_nothing);
15395
15396 create_output_operand (&ops[0], target, mode);
15397 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15398 expand_insn (icode, 2, ops);
15399
15400 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15401 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15402 emit_insr (target, builder.elt (i));
15403 }
15404
15405 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15406 when all trailing elements of builder are same.
15407 This works as follows:
15408 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15409 (b) Insert remaining elements in TARGET using insr.
15410
15411 ??? The heuristic used is to do the above if the number of identical
15412 trailing elements is at least 3/4 of the total number of elements,
15413 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
15414
15415 static bool
15416 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15417 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15418 {
15419 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15420 if (ndups >= (3 * nelts_reqd) / 4)
15421 {
15422 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15423 nelts_reqd - ndups + 1);
15424 return true;
15425 }
15426
15427 return false;
15428 }
15429
15430 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15431 of elements in BUILDER.
15432
15433 The function tries to initialize TARGET from BUILDER if it fits one
15434 of the special cases outlined below.
15435
15436 Failing that, the function divides BUILDER into two sub-vectors:
15437 v_even = even elements of BUILDER;
15438 v_odd = odd elements of BUILDER;
15439
15440 and recursively calls itself with v_even and v_odd.
15441
15442 if (recursive call succeeded for v_even or v_odd)
15443 TARGET = zip (v_even, v_odd)
15444
15445 The function returns true if it managed to build TARGET from BUILDER
15446 with one of the special cases, false otherwise.
15447
15448 Example: {a, 1, b, 2, c, 3, d, 4}
15449
15450 The vector gets divided into:
15451 v_even = {a, b, c, d}
15452 v_odd = {1, 2, 3, 4}
15453
15454 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15455 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15456
15457 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15458 4 elements, so we construct tmp1 from v_even using insr:
15459 tmp1 = dup(d)
15460 insr tmp1, c
15461 insr tmp1, b
15462 insr tmp1, a
15463
15464 And finally:
15465 TARGET = zip (tmp1, tmp2)
15466 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15467
15468 static bool
15469 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15470 int nelts, int nelts_reqd)
15471 {
15472 machine_mode mode = GET_MODE (target);
15473
15474 /* Case 1: Vector contains trailing constants. */
15475
15476 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15477 (target, builder, nelts, nelts_reqd))
15478 return true;
15479
15480 /* Case 2: Vector contains leading constants. */
15481
15482 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15483 for (int i = 0; i < nelts_reqd; i++)
15484 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15485 rev_builder.finalize ();
15486
15487 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15488 (target, rev_builder, nelts, nelts_reqd))
15489 {
15490 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15491 return true;
15492 }
15493
15494 /* Case 3: Vector contains trailing same element. */
15495
15496 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15497 (target, builder, nelts_reqd))
15498 return true;
15499
15500 /* Case 4: Vector contains leading same element. */
15501
15502 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15503 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
15504 {
15505 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15506 return true;
15507 }
15508
15509 /* Avoid recursing below 4 elements.
15510 ??? The threshold 4 may need fine-tuning. */
15511
15512 if (nelts_reqd <= 4)
15513 return false;
15514
15515 rtx_vector_builder v_even (mode, 1, nelts);
15516 rtx_vector_builder v_odd (mode, 1, nelts);
15517
15518 for (int i = 0; i < nelts * 2; i += 2)
15519 {
15520 v_even.quick_push (builder.elt (i));
15521 v_odd.quick_push (builder.elt (i + 1));
15522 }
15523
15524 v_even.finalize ();
15525 v_odd.finalize ();
15526
15527 rtx tmp1 = gen_reg_rtx (mode);
15528 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
15529 nelts, nelts_reqd / 2);
15530
15531 rtx tmp2 = gen_reg_rtx (mode);
15532 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
15533 nelts, nelts_reqd / 2);
15534
15535 if (!did_even_p && !did_odd_p)
15536 return false;
15537
15538 /* Initialize whichever of v_even and v_odd did not match a special case
15539 using INSR, then zip v_even and v_odd together. */
15540
15541 if (!did_even_p)
15542 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
15543
15544 if (!did_odd_p)
15545 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
15546
15547 rtvec v = gen_rtvec (2, tmp1, tmp2);
15548 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
15549 return true;
15550 }
15551
15552 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
15553
15554 void
15555 aarch64_sve_expand_vector_init (rtx target, rtx vals)
15556 {
15557 machine_mode mode = GET_MODE (target);
15558 int nelts = XVECLEN (vals, 0);
15559
15560 rtx_vector_builder v (mode, 1, nelts);
15561 for (int i = 0; i < nelts; i++)
15562 v.quick_push (XVECEXP (vals, 0, i));
15563 v.finalize ();
15564
15565 /* If neither sub-vector of v could be initialized specially,
15566 then use INSR to insert all elements from v into TARGET.
15567 ??? This might not be optimal for vectors with large
15568 initializers like 16-element or above.
15569 For nelts < 4, it probably isn't useful to handle specially. */
15570
15571 if (nelts < 4
15572 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
15573 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
15574 }
15575
15576 static unsigned HOST_WIDE_INT
15577 aarch64_shift_truncation_mask (machine_mode mode)
15578 {
15579 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15580 return 0;
15581 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15582 }
15583
15584 /* Select a format to encode pointers in exception handling data. */
15585 int
15586 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15587 {
15588 int type;
15589 switch (aarch64_cmodel)
15590 {
15591 case AARCH64_CMODEL_TINY:
15592 case AARCH64_CMODEL_TINY_PIC:
15593 case AARCH64_CMODEL_SMALL:
15594 case AARCH64_CMODEL_SMALL_PIC:
15595 case AARCH64_CMODEL_SMALL_SPIC:
15596 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15597 for everything. */
15598 type = DW_EH_PE_sdata4;
15599 break;
15600 default:
15601 /* No assumptions here. 8-byte relocs required. */
15602 type = DW_EH_PE_sdata8;
15603 break;
15604 }
15605 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15606 }
15607
15608 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15609
15610 static void
15611 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15612 {
15613 if (aarch64_simd_decl_p (decl))
15614 {
15615 fprintf (stream, "\t.variant_pcs\t");
15616 assemble_name (stream, name);
15617 fprintf (stream, "\n");
15618 }
15619 }
15620
15621 /* The last .arch and .tune assembly strings that we printed. */
15622 static std::string aarch64_last_printed_arch_string;
15623 static std::string aarch64_last_printed_tune_string;
15624
15625 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15626 by the function fndecl. */
15627
15628 void
15629 aarch64_declare_function_name (FILE *stream, const char* name,
15630 tree fndecl)
15631 {
15632 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15633
15634 struct cl_target_option *targ_options;
15635 if (target_parts)
15636 targ_options = TREE_TARGET_OPTION (target_parts);
15637 else
15638 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15639 gcc_assert (targ_options);
15640
15641 const struct processor *this_arch
15642 = aarch64_get_arch (targ_options->x_explicit_arch);
15643
15644 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
15645 std::string extension
15646 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15647 this_arch->flags);
15648 /* Only update the assembler .arch string if it is distinct from the last
15649 such string we printed. */
15650 std::string to_print = this_arch->name + extension;
15651 if (to_print != aarch64_last_printed_arch_string)
15652 {
15653 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15654 aarch64_last_printed_arch_string = to_print;
15655 }
15656
15657 /* Print the cpu name we're tuning for in the comments; this might be
15658 useful to readers of the generated asm. Do it only when it changes
15659 from function to function and verbose assembly is requested. */
15660 const struct processor *this_tune
15661 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15662
15663 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15664 {
15665 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15666 this_tune->name);
15667 aarch64_last_printed_tune_string = this_tune->name;
15668 }
15669
15670 aarch64_asm_output_variant_pcs (stream, fndecl, name);
15671
15672 /* Don't forget the type directive for ELF. */
15673 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15674 ASM_OUTPUT_LABEL (stream, name);
15675 }
15676
15677 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15678
15679 void
15680 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15681 {
15682 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15683 const char *value = IDENTIFIER_POINTER (target);
15684 aarch64_asm_output_variant_pcs (stream, decl, name);
15685 ASM_OUTPUT_DEF (stream, name, value);
15686 }
15687
15688 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15689 function symbol references. */
15690
15691 void
15692 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15693 {
15694 default_elf_asm_output_external (stream, decl, name);
15695 aarch64_asm_output_variant_pcs (stream, decl, name);
15696 }
15697
15698 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
15699 Used to output the .cfi_b_key_frame directive when signing the current
15700 function with the B key. */
15701
15702 void
15703 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
15704 {
15705 if (!cfun->is_thunk && aarch64_return_address_signing_enabled ()
15706 && aarch64_ra_sign_key == AARCH64_KEY_B)
15707 asm_fprintf (f, "\t.cfi_b_key_frame\n");
15708 }
15709
15710 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15711
15712 static void
15713 aarch64_start_file (void)
15714 {
15715 struct cl_target_option *default_options
15716 = TREE_TARGET_OPTION (target_option_default_node);
15717
15718 const struct processor *default_arch
15719 = aarch64_get_arch (default_options->x_explicit_arch);
15720 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
15721 std::string extension
15722 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15723 default_arch->flags);
15724
15725 aarch64_last_printed_arch_string = default_arch->name + extension;
15726 aarch64_last_printed_tune_string = "";
15727 asm_fprintf (asm_out_file, "\t.arch %s\n",
15728 aarch64_last_printed_arch_string.c_str ());
15729
15730 default_file_start ();
15731 }
15732
15733 /* Emit load exclusive. */
15734
15735 static void
15736 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15737 rtx mem, rtx model_rtx)
15738 {
15739 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15740 }
15741
15742 /* Emit store exclusive. */
15743
15744 static void
15745 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15746 rtx rval, rtx mem, rtx model_rtx)
15747 {
15748 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15749 }
15750
15751 /* Emit the jump pattern INSN and mark it as unlikely to be taken. */
15752
15753 static void
15754 aarch64_emit_unlikely_jump (rtx insn)
15755 {
15756 rtx_insn *jump = emit_jump_insn (insn);
15757 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15758 }
15759
15760 /* Expand a compare and swap pattern. */
15761
15762 void
15763 aarch64_expand_compare_and_swap (rtx operands[])
15764 {
15765 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15766 machine_mode mode, r_mode;
15767
15768 bval = operands[0];
15769 rval = operands[1];
15770 mem = operands[2];
15771 oldval = operands[3];
15772 newval = operands[4];
15773 is_weak = operands[5];
15774 mod_s = operands[6];
15775 mod_f = operands[7];
15776 mode = GET_MODE (mem);
15777
15778 /* Normally the succ memory model must be stronger than fail, but in the
15779 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15780 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15781 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15782 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15783 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15784
15785 r_mode = mode;
15786 if (mode == QImode || mode == HImode)
15787 {
15788 r_mode = SImode;
15789 rval = gen_reg_rtx (r_mode);
15790 }
15791
15792 if (TARGET_LSE)
15793 {
15794 /* The CAS insn requires oldval and rval overlap, but we need to
15795 have a copy of oldval saved across the operation to tell if
15796 the operation is successful. */
15797 if (reg_overlap_mentioned_p (rval, oldval))
15798 rval = copy_to_mode_reg (r_mode, oldval);
15799 else
15800 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15801
15802 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15803 newval, mod_s));
15804 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15805 }
15806 else
15807 {
15808 /* The oldval predicate varies by mode. Test it and force to reg. */
15809 insn_code code = code_for_aarch64_compare_and_swap (mode);
15810 if (!insn_data[code].operand[2].predicate (oldval, mode))
15811 oldval = force_reg (mode, oldval);
15812
15813 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15814 is_weak, mod_s, mod_f));
15815 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15816 }
15817
15818 if (r_mode != mode)
15819 rval = gen_lowpart (mode, rval);
15820 emit_move_insn (operands[1], rval);
15821
15822 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15823 emit_insn (gen_rtx_SET (bval, x));
15824 }
15825
15826 /* Emit a barrier appropriate for memory model MODEL at the end of a
15827 sequence implementing an atomic operation. */
15828
15829 static void
15830 aarch64_emit_post_barrier (enum memmodel model)
15831 {
15832 const enum memmodel base_model = memmodel_base (model);
15833
15834 if (is_mm_sync (model)
15835 && (base_model == MEMMODEL_ACQUIRE
15836 || base_model == MEMMODEL_ACQ_REL
15837 || base_model == MEMMODEL_SEQ_CST))
15838 {
15839 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15840 }
15841 }
15842
15843 /* Split a compare and swap pattern. */
15844
15845 void
15846 aarch64_split_compare_and_swap (rtx operands[])
15847 {
15848 rtx rval, mem, oldval, newval, scratch;
15849 machine_mode mode;
15850 bool is_weak;
15851 rtx_code_label *label1, *label2;
15852 rtx x, cond;
15853 enum memmodel model;
15854 rtx model_rtx;
15855
15856 rval = operands[0];
15857 mem = operands[1];
15858 oldval = operands[2];
15859 newval = operands[3];
15860 is_weak = (operands[4] != const0_rtx);
15861 model_rtx = operands[5];
15862 scratch = operands[7];
15863 mode = GET_MODE (mem);
15864 model = memmodel_from_int (INTVAL (model_rtx));
15865
15866 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15867 loop:
15868 .label1:
15869 LD[A]XR rval, [mem]
15870 CBNZ rval, .label2
15871 ST[L]XR scratch, newval, [mem]
15872 CBNZ scratch, .label1
15873 .label2:
15874 CMP rval, 0. */
15875 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15876
15877 label1 = NULL;
15878 if (!is_weak)
15879 {
15880 label1 = gen_label_rtx ();
15881 emit_label (label1);
15882 }
15883 label2 = gen_label_rtx ();
15884
15885 /* The initial load can be relaxed for a __sync operation since a final
15886 barrier will be emitted to stop code hoisting. */
15887 if (is_mm_sync (model))
15888 aarch64_emit_load_exclusive (mode, rval, mem,
15889 GEN_INT (MEMMODEL_RELAXED));
15890 else
15891 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15892
15893 if (strong_zero_p)
15894 {
15895 if (aarch64_track_speculation)
15896 {
15897 /* Emit an explicit compare instruction, so that we can correctly
15898 track the condition codes. */
15899 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15900 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15901 }
15902 else
15903 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15904
15905 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15906 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15907 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15908 }
15909 else
15910 {
15911 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15912 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15913 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15914 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15915 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15916 }
15917
15918 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15919
15920 if (!is_weak)
15921 {
15922 if (aarch64_track_speculation)
15923 {
15924 /* Emit an explicit compare instruction, so that we can correctly
15925 track the condition codes. */
15926 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15927 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15928 }
15929 else
15930 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15931
15932 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15933 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15934 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15935 }
15936 else
15937 {
15938 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15939 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15940 emit_insn (gen_rtx_SET (cond, x));
15941 }
15942
15943 emit_label (label2);
15944 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15945 to set the condition flags. If this is not used it will be removed by
15946 later passes. */
15947 if (strong_zero_p)
15948 {
15949 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15950 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15951 emit_insn (gen_rtx_SET (cond, x));
15952 }
15953 /* Emit any final barrier needed for a __sync operation. */
15954 if (is_mm_sync (model))
15955 aarch64_emit_post_barrier (model);
15956 }
15957
15958 /* Split an atomic operation. */
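/* Roughly, for an atomic "mem = mem <op> value" this expands to:
     .label:
       LD[A]XR  old, [mem]
       <op>     new, old, value
       ST[L]XR  status, new, [mem]
       CBNZ     status, .label
   followed by any final barrier needed for a __sync operation.  */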
15959
15960 void
15961 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15962 rtx value, rtx model_rtx, rtx cond)
15963 {
15964 machine_mode mode = GET_MODE (mem);
15965 machine_mode wmode = (mode == DImode ? DImode : SImode);
15966 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15967 const bool is_sync = is_mm_sync (model);
15968 rtx_code_label *label;
15969 rtx x;
15970
15971 /* Split the atomic operation into a sequence. */
15972 label = gen_label_rtx ();
15973 emit_label (label);
15974
15975 if (new_out)
15976 new_out = gen_lowpart (wmode, new_out);
15977 if (old_out)
15978 old_out = gen_lowpart (wmode, old_out);
15979 else
15980 old_out = new_out;
15981 value = simplify_gen_subreg (wmode, value, mode, 0);
15982
15983 /* The initial load can be relaxed for a __sync operation since a final
15984 barrier will be emitted to stop code hoisting. */
15985 if (is_sync)
15986 aarch64_emit_load_exclusive (mode, old_out, mem,
15987 GEN_INT (MEMMODEL_RELAXED));
15988 else
15989 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15990
15991 switch (code)
15992 {
15993 case SET:
15994 new_out = value;
15995 break;
15996
15997 case NOT:
15998 x = gen_rtx_AND (wmode, old_out, value);
15999 emit_insn (gen_rtx_SET (new_out, x));
16000 x = gen_rtx_NOT (wmode, new_out);
16001 emit_insn (gen_rtx_SET (new_out, x));
16002 break;
16003
16004 case MINUS:
16005 if (CONST_INT_P (value))
16006 {
16007 value = GEN_INT (-INTVAL (value));
16008 code = PLUS;
16009 }
16010 /* Fall through. */
16011
16012 default:
16013 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16014 emit_insn (gen_rtx_SET (new_out, x));
16015 break;
16016 }
16017
16018 aarch64_emit_store_exclusive (mode, cond, mem,
16019 gen_lowpart (mode, new_out), model_rtx);
16020
16021 if (aarch64_track_speculation)
16022 {
16023 /* Emit an explicit compare instruction, so that we can correctly
16024 track the condition codes. */
16025 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16026 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16027 }
16028 else
16029 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16030
16031 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16032 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16033 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16034
16035 /* Emit any final barrier needed for a __sync operation. */
16036 if (is_sync)
16037 aarch64_emit_post_barrier (model);
16038 }
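/* For example, a relaxed __atomic_fetch_add on a DImode location splits into
   a loop of roughly the following shape (register names are illustrative and
   the acquire/release forms of the exclusives depend on MODEL_RTX):

	.L1:	ldxr	x0, [x2]	// OLD_OUT
		add	x1, x0, x3	// NEW_OUT = OLD_OUT + VALUE
		stxr	w4, x1, [x2]	// COND records success/failure
		cbnz	w4, .L1		// retry until the exclusive store succeeds
*/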
16039
16040 static void
16041 aarch64_init_libfuncs (void)
16042 {
16043 /* Half-precision float operations. The compiler handles all operations
16044 with NULL libfuncs by converting to SFmode. */
16045
16046 /* Conversions. */
16047 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16048 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16049
16050 /* Arithmetic. */
16051 set_optab_libfunc (add_optab, HFmode, NULL);
16052 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16053 set_optab_libfunc (smul_optab, HFmode, NULL);
16054 set_optab_libfunc (neg_optab, HFmode, NULL);
16055 set_optab_libfunc (sub_optab, HFmode, NULL);
16056
16057 /* Comparisons. */
16058 set_optab_libfunc (eq_optab, HFmode, NULL);
16059 set_optab_libfunc (ne_optab, HFmode, NULL);
16060 set_optab_libfunc (lt_optab, HFmode, NULL);
16061 set_optab_libfunc (le_optab, HFmode, NULL);
16062 set_optab_libfunc (ge_optab, HFmode, NULL);
16063 set_optab_libfunc (gt_optab, HFmode, NULL);
16064 set_optab_libfunc (unord_optab, HFmode, NULL);
16065 }
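/* With the arithmetic and comparison libfuncs left as NULL, a half-precision
   operation such as adding two __fp16 values is, roughly speaking, widened
   through __gnu_h2f_ieee, performed in SFmode, and narrowed back with
   __gnu_f2h_ieee when no native FP16 arithmetic instructions are available.
   This is an illustrative description rather than a guaranteed code
   sequence. */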
16066
16067 /* Target hook for c_mode_for_suffix. */
16068 static machine_mode
16069 aarch64_c_mode_for_suffix (char suffix)
16070 {
16071 if (suffix == 'q')
16072 return TFmode;
16073
16074 return VOIDmode;
16075 }
16076
16077 /* We can only represent floating point constants which will fit in
16078 "quarter-precision" values. These values are characterised by
16079 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16080 by:
16081
16082 (-1)^s * (n/16) * 2^r
16083
16084 Where:
16085 's' is the sign bit.
16086 'n' is an integer in the range 16 <= n <= 31.
16087 'r' is an integer in the range -3 <= r <= 4. */
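/* For example, 1.5 is representable as (-1)^0 * (24/16) * 2^0 and 0.125 as
   (-1)^0 * (16/16) * 2^-3, while the largest representable magnitude is
   (31/16) * 2^4 = 31.0; a value such as 0.1 falls outside this set. */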
16088
16089 /* Return true iff X can be represented by a quarter-precision
16090 floating point immediate operand. Note, we cannot represent 0.0. */
16091 bool
16092 aarch64_float_const_representable_p (rtx x)
16093 {
16094 /* This represents our current view of how many bits
16095 make up the mantissa. */
16096 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16097 int exponent;
16098 unsigned HOST_WIDE_INT mantissa, mask;
16099 REAL_VALUE_TYPE r, m;
16100 bool fail;
16101
16102 if (!CONST_DOUBLE_P (x))
16103 return false;
16104
16105 if (GET_MODE (x) == VOIDmode
16106 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16107 return false;
16108
16109 r = *CONST_DOUBLE_REAL_VALUE (x);
16110
16111 /* We cannot represent infinities, NaNs or +/-zero. We won't
16112 know if we have +zero until we analyse the mantissa, but we
16113 can reject the other invalid values. */
16114 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16115 || REAL_VALUE_MINUS_ZERO (r))
16116 return false;
16117
16118 /* Extract exponent. */
16119 r = real_value_abs (&r);
16120 exponent = REAL_EXP (&r);
16121
16122 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16123 highest (sign) bit, with a fixed binary point at bit point_pos.
16124 W holds the low part of the mantissa in its low HOST_WIDE_INT and the high part in its high HOST_WIDE_INT.
16125 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16126 bits for the mantissa, this can fail (low bits will be lost). */
16127 real_ldexp (&m, &r, point_pos - exponent);
16128 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16129
16130 /* If the low part of the mantissa has bits set we cannot represent
16131 the value. */
16132 if (w.ulow () != 0)
16133 return false;
16134 /* We have rejected the lower HOST_WIDE_INT, so update our
16135 understanding of how many bits lie in the mantissa and
16136 look only at the high HOST_WIDE_INT. */
16137 mantissa = w.elt (1);
16138 point_pos -= HOST_BITS_PER_WIDE_INT;
16139
16140 /* We can only represent values with a mantissa of the form 1.xxxx. */
16141 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16142 if ((mantissa & mask) != 0)
16143 return false;
16144
16145 /* Having filtered unrepresentable values, we may now remove all
16146 but the highest 5 bits. */
16147 mantissa >>= point_pos - 5;
16148
16149 /* We cannot represent the value 0.0, so reject it. This is handled
16150 elsewhere. */
16151 if (mantissa == 0)
16152 return false;
16153
16154 /* Then, as bit 4 is always set, we can mask it off, leaving
16155 the mantissa in the range [0, 15]. */
16156 mantissa &= ~(1 << 4);
16157 gcc_assert (mantissa <= 15);
16158
16159 /* GCC internally does not use IEEE754-like encoding (where normalized
16160 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
16161 Our mantissa values are shifted 4 places to the left relative to
16162 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16163 by 5 places to correct for GCC's representation. */
16164 exponent = 5 - exponent;
16165
16166 return (exponent >= 0 && exponent <= 7);
16167 }
16168
16169 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
16170 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
16171 output MOVI/MVNI, ORR or BIC immediate. */
16172 char*
16173 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16174 enum simd_immediate_check which)
16175 {
16176 bool is_valid;
16177 static char templ[40];
16178 const char *mnemonic;
16179 const char *shift_op;
16180 unsigned int lane_count = 0;
16181 char element_char;
16182
16183 struct simd_immediate_info info;
16184
16185 /* This will return true to show const_vector is legal for use as either
16186 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16187 It will also update INFO to show how the immediate should be generated.
16188 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16189 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16190 gcc_assert (is_valid);
16191
16192 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16193 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16194
16195 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16196 {
16197 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
16198 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16199 move immediate path. */
16200 if (aarch64_float_const_zero_rtx_p (info.value))
16201 info.value = GEN_INT (0);
16202 else
16203 {
16204 const unsigned int buf_size = 20;
16205 char float_buf[buf_size] = {'\0'};
16206 real_to_decimal_for_mode (float_buf,
16207 CONST_DOUBLE_REAL_VALUE (info.value),
16208 buf_size, buf_size, 1, info.elt_mode);
16209
16210 if (lane_count == 1)
16211 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16212 else
16213 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16214 lane_count, element_char, float_buf);
16215 return templ;
16216 }
16217 }
16218
16219 gcc_assert (CONST_INT_P (info.value));
16220
16221 if (which == AARCH64_CHECK_MOV)
16222 {
16223 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16224 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
16225 if (lane_count == 1)
16226 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16227 mnemonic, UINTVAL (info.value));
16228 else if (info.shift)
16229 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16230 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16231 element_char, UINTVAL (info.value), shift_op, info.shift);
16232 else
16233 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16234 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16235 element_char, UINTVAL (info.value));
16236 }
16237 else
16238 {
16239 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16240 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16241 if (info.shift)
16242 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16243 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16244 element_char, UINTVAL (info.value), "lsl", info.shift);
16245 else
16246 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16247 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16248 element_char, UINTVAL (info.value));
16249 }
16250 return templ;
16251 }
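/* Illustrative outputs of the routine above (the register number, lane count
   and hex formatting depend on the operands): a V4SI constant with every lane
   equal to 0x100 can print as "movi v0.4s, 0x1, lsl 8", its bitwise inverse
   as "mvni v0.4s, 0x1, lsl 8", and an all-lanes 1.0 float constant uses the
   "fmov %0.4s, ..." form with the decimal string produced by
   real_to_decimal_for_mode. */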
16252
16253 char*
16254 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16255 {
16256
16257 /* If a floating point number was passed and we desire to use it in an
16258 integer mode do the conversion to integer. */
16259 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16260 {
16261 unsigned HOST_WIDE_INT ival;
16262 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16263 gcc_unreachable ();
16264 immediate = gen_int_mode (ival, mode);
16265 }
16266
16267 machine_mode vmode;
16268 /* Use a 64 bit mode for everything except for DI/DF mode, where we use
16269 a 128 bit vector mode. */
16270 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16271
16272 vmode = aarch64_simd_container_mode (mode, width);
16273 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16274 return aarch64_output_simd_mov_immediate (v_op, width);
16275 }
16276
16277 /* Return the output string to use for moving immediate CONST_VECTOR
16278 into an SVE register. */
16279
16280 char *
16281 aarch64_output_sve_mov_immediate (rtx const_vector)
16282 {
16283 static char templ[40];
16284 struct simd_immediate_info info;
16285 char element_char;
16286
16287 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16288 gcc_assert (is_valid);
16289
16290 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16291
16292 if (info.step)
16293 {
16294 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16295 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16296 element_char, INTVAL (info.value), INTVAL (info.step));
16297 return templ;
16298 }
16299
16300 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16301 {
16302 if (aarch64_float_const_zero_rtx_p (info.value))
16303 info.value = GEN_INT (0);
16304 else
16305 {
16306 const int buf_size = 20;
16307 char float_buf[buf_size] = {};
16308 real_to_decimal_for_mode (float_buf,
16309 CONST_DOUBLE_REAL_VALUE (info.value),
16310 buf_size, buf_size, 1, info.elt_mode);
16311
16312 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16313 element_char, float_buf);
16314 return templ;
16315 }
16316 }
16317
16318 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16319 element_char, INTVAL (info.value));
16320 return templ;
16321 }
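/* For instance, an SVE constant that is a linear series starting at 0 with
   step 1 prints as "index z0.s, #0, #1", an all-lanes integer 1 as
   "mov z0.b, #1", and an all-lanes floating-point 1.0 uses the "fmov" form
   with the decimal string produced above; the z0 register name is only
   illustrative. */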
16322
16323 /* Return the asm format for a PTRUE instruction whose destination has
16324 mode MODE. SUFFIX is the element size suffix. */
16325
16326 char *
16327 aarch64_output_ptrue (machine_mode mode, char suffix)
16328 {
16329 unsigned int nunits;
16330 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16331 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16332 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16333 else
16334 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16335 return buf;
16336 }
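/* For example, with 16 byte elements known at compile time this prints
   "ptrue p0.b, vl16", whereas for a variable-length vector it prints
   "ptrue p0.b, all" (the predicate register number is illustrative). */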
16337
16338 /* Split operands into moves from op[1] + op[2] into op[0]. */
16339
16340 void
16341 aarch64_split_combinev16qi (rtx operands[3])
16342 {
16343 unsigned int dest = REGNO (operands[0]);
16344 unsigned int src1 = REGNO (operands[1]);
16345 unsigned int src2 = REGNO (operands[2]);
16346 machine_mode halfmode = GET_MODE (operands[1]);
16347 unsigned int halfregs = REG_NREGS (operands[1]);
16348 rtx destlo, desthi;
16349
16350 gcc_assert (halfmode == V16QImode);
16351
16352 if (src1 == dest && src2 == dest + halfregs)
16353 {
16354 /* No-op move. Can't split to nothing; emit something. */
16355 emit_note (NOTE_INSN_DELETED);
16356 return;
16357 }
16358
16359 /* Preserve register attributes for variable tracking. */
16360 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16361 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16362 GET_MODE_SIZE (halfmode));
16363
16364 /* Special case of reversed high/low parts. */
16365 if (reg_overlap_mentioned_p (operands[2], destlo)
16366 && reg_overlap_mentioned_p (operands[1], desthi))
16367 {
16368 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16369 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16370 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16371 }
16372 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16373 {
16374 /* Try to avoid unnecessary moves if part of the result
16375 is in the right place already. */
16376 if (src1 != dest)
16377 emit_move_insn (destlo, operands[1]);
16378 if (src2 != dest + halfregs)
16379 emit_move_insn (desthi, operands[2]);
16380 }
16381 else
16382 {
16383 if (src2 != dest + halfregs)
16384 emit_move_insn (desthi, operands[2]);
16385 if (src1 != dest)
16386 emit_move_insn (destlo, operands[1]);
16387 }
16388 }
16389
16390 /* vec_perm support. */
16391
16392 struct expand_vec_perm_d
16393 {
16394 rtx target, op0, op1;
16395 vec_perm_indices perm;
16396 machine_mode vmode;
16397 unsigned int vec_flags;
16398 bool one_vector_p;
16399 bool testing_p;
16400 };
16401
16402 /* Generate a variable permutation. */
16403
16404 static void
16405 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16406 {
16407 machine_mode vmode = GET_MODE (target);
16408 bool one_vector_p = rtx_equal_p (op0, op1);
16409
16410 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16411 gcc_checking_assert (GET_MODE (op0) == vmode);
16412 gcc_checking_assert (GET_MODE (op1) == vmode);
16413 gcc_checking_assert (GET_MODE (sel) == vmode);
16414 gcc_checking_assert (TARGET_SIMD);
16415
16416 if (one_vector_p)
16417 {
16418 if (vmode == V8QImode)
16419 {
16420 /* Expand the argument to a V16QI mode by duplicating it. */
16421 rtx pair = gen_reg_rtx (V16QImode);
16422 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16423 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16424 }
16425 else
16426 {
16427 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16428 }
16429 }
16430 else
16431 {
16432 rtx pair;
16433
16434 if (vmode == V8QImode)
16435 {
16436 pair = gen_reg_rtx (V16QImode);
16437 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16438 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16439 }
16440 else
16441 {
16442 pair = gen_reg_rtx (OImode);
16443 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16444 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16445 }
16446 }
16447 }
16448
16449 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16450 NELT is the number of elements in the vector. */
16451
16452 void
16453 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16454 unsigned int nelt)
16455 {
16456 machine_mode vmode = GET_MODE (target);
16457 bool one_vector_p = rtx_equal_p (op0, op1);
16458 rtx mask;
16459
16460 /* The TBL instruction does not use a modulo index, so we must take care
16461 of that ourselves. */
16462 mask = aarch64_simd_gen_const_vector_dup (vmode,
16463 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16464 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16465
16466 /* For big-endian, we also need to reverse the index within the vector
16467 (but not which vector). */
16468 if (BYTES_BIG_ENDIAN)
16469 {
16470 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16471 if (!one_vector_p)
16472 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16473 sel = expand_simple_binop (vmode, XOR, sel, mask,
16474 NULL, 0, OPTAB_LIB_WIDEN);
16475 }
16476 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16477 }
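/* As a worked example for a single V8QI input (NELT = 8): the mask is a
   vector of 7s, so a selector lane holding 10 becomes 10 & 7 = 2 and selects
   element 2, giving the modulo behaviour vec_perm expects. On big-endian the
   subsequent XOR with 7 flips the index within the vector, e.g. 2 becomes 5,
   without changing which input vector is selected. */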
16478
16479 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16480
16481 static void
16482 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16483 {
16484 emit_insn (gen_rtx_SET (target,
16485 gen_rtx_UNSPEC (GET_MODE (target),
16486 gen_rtvec (2, op0, op1), code)));
16487 }
16488
16489 /* Expand an SVE vec_perm with the given operands. */
16490
16491 void
16492 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16493 {
16494 machine_mode data_mode = GET_MODE (target);
16495 machine_mode sel_mode = GET_MODE (sel);
16496 /* Enforced by the pattern condition. */
16497 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16498
16499 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16500 size of the two value vectors, i.e. the upper bits of the indices
16501 are effectively ignored. SVE TBL instead produces 0 for any
16502 out-of-range indices, so we need to modulo all the vec_perm indices
16503 to ensure they are all in range. */
16504 rtx sel_reg = force_reg (sel_mode, sel);
16505
16506 /* Check if the sel only references the first values vector. */
16507 if (GET_CODE (sel) == CONST_VECTOR
16508 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16509 {
16510 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16511 return;
16512 }
16513
16514 /* Check if the two values vectors are the same. */
16515 if (rtx_equal_p (op0, op1))
16516 {
16517 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16518 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16519 NULL, 0, OPTAB_DIRECT);
16520 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16521 return;
16522 }
16523
16524 /* Run TBL on each value vector and combine the results. */
16525
16526 rtx res0 = gen_reg_rtx (data_mode);
16527 rtx res1 = gen_reg_rtx (data_mode);
16528 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16529 if (GET_CODE (sel) != CONST_VECTOR
16530 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16531 {
16532 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16533 2 * nunits - 1);
16534 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16535 NULL, 0, OPTAB_DIRECT);
16536 }
16537 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16538 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16539 NULL, 0, OPTAB_DIRECT);
16540 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16541 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16542 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16543 else
16544 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16545 }
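/* Worked example for the two-vector case with NUNITS = 4: a selector lane
   holding 6 is first reduced to 6 & 7 = 6; the TBL on OP0 sees an
   out-of-range index and yields 0, while the TBL on OP1 uses 6 - 4 = 2 and
   yields OP1's element 2, so the final IOR (or UNSPEC_IORF for float element
   modes) produces OP1[2] as required. */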
16546
16547 /* Recognize patterns suitable for the TRN instructions. */
16548 static bool
16549 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16550 {
16551 HOST_WIDE_INT odd;
16552 poly_uint64 nelt = d->perm.length ();
16553 rtx out, in0, in1, x;
16554 machine_mode vmode = d->vmode;
16555
16556 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16557 return false;
16558
16559 /* Note that these are little-endian tests.
16560 We correct for big-endian later. */
16561 if (!d->perm[0].is_constant (&odd)
16562 || (odd != 0 && odd != 1)
16563 || !d->perm.series_p (0, 2, odd, 2)
16564 || !d->perm.series_p (1, 2, nelt + odd, 2))
16565 return false;
16566
16567 /* Success! */
16568 if (d->testing_p)
16569 return true;
16570
16571 in0 = d->op0;
16572 in1 = d->op1;
16573 /* We don't need a big-endian lane correction for SVE; see the comment
16574 at the head of aarch64-sve.md for details. */
16575 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16576 {
16577 x = in0, in0 = in1, in1 = x;
16578 odd = !odd;
16579 }
16580 out = d->target;
16581
16582 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16583 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16584 return true;
16585 }
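/* For a V4SI permutation this matches the index patterns {0, 4, 2, 6}
   (emitted as TRN1) and {1, 5, 3, 7} (emitted as TRN2), before any
   big-endian correction. */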
16586
16587 /* Recognize patterns suitable for the UZP instructions. */
16588 static bool
16589 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16590 {
16591 HOST_WIDE_INT odd;
16592 rtx out, in0, in1, x;
16593 machine_mode vmode = d->vmode;
16594
16595 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16596 return false;
16597
16598 /* Note that these are little-endian tests.
16599 We correct for big-endian later. */
16600 if (!d->perm[0].is_constant (&odd)
16601 || (odd != 0 && odd != 1)
16602 || !d->perm.series_p (0, 1, odd, 2))
16603 return false;
16604
16605 /* Success! */
16606 if (d->testing_p)
16607 return true;
16608
16609 in0 = d->op0;
16610 in1 = d->op1;
16611 /* We don't need a big-endian lane correction for SVE; see the comment
16612 at the head of aarch64-sve.md for details. */
16613 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16614 {
16615 x = in0, in0 = in1, in1 = x;
16616 odd = !odd;
16617 }
16618 out = d->target;
16619
16620 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16621 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16622 return true;
16623 }
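/* For a V4SI permutation this matches {0, 2, 4, 6} (UZP1) and
   {1, 3, 5, 7} (UZP2), before any big-endian correction. */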
16624
16625 /* Recognize patterns suitable for the ZIP instructions. */
16626 static bool
16627 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16628 {
16629 unsigned int high;
16630 poly_uint64 nelt = d->perm.length ();
16631 rtx out, in0, in1, x;
16632 machine_mode vmode = d->vmode;
16633
16634 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16635 return false;
16636
16637 /* Note that these are little-endian tests.
16638 We correct for big-endian later. */
16639 poly_uint64 first = d->perm[0];
16640 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16641 || !d->perm.series_p (0, 2, first, 1)
16642 || !d->perm.series_p (1, 2, first + nelt, 1))
16643 return false;
16644 high = maybe_ne (first, 0U);
16645
16646 /* Success! */
16647 if (d->testing_p)
16648 return true;
16649
16650 in0 = d->op0;
16651 in1 = d->op1;
16652 /* We don't need a big-endian lane correction for SVE; see the comment
16653 at the head of aarch64-sve.md for details. */
16654 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16655 {
16656 x = in0, in0 = in1, in1 = x;
16657 high = !high;
16658 }
16659 out = d->target;
16660
16661 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16662 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16663 return true;
16664 }
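/* For a V4SI permutation this matches {0, 4, 1, 5} (ZIP1) and
   {2, 6, 3, 7} (ZIP2), before any big-endian correction. */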
16665
16666 /* Recognize patterns for the EXT insn. */
16667
16668 static bool
16669 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16670 {
16671 HOST_WIDE_INT location;
16672 rtx offset;
16673
16674 /* The first element always refers to the first vector.
16675 Check if the extracted indices are increasing by one. */
16676 if (d->vec_flags == VEC_SVE_PRED
16677 || !d->perm[0].is_constant (&location)
16678 || !d->perm.series_p (0, 1, location, 1))
16679 return false;
16680
16681 /* Success! */
16682 if (d->testing_p)
16683 return true;
16684
16685 /* The case where (location == 0) is a no-op for both big- and little-endian,
16686 and is removed by the mid-end at optimization levels -O1 and higher.
16687
16688 We don't need a big-endian lane correction for SVE; see the comment
16689 at the head of aarch64-sve.md for details. */
16690 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16691 {
16692 /* After setup, we want the high elements of the first vector (stored
16693 at the LSB end of the register), and the low elements of the second
16694 vector (stored at the MSB end of the register). So swap. */
16695 std::swap (d->op0, d->op1);
16696 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16697 to_constant () is safe since this is restricted to Advanced SIMD
16698 vectors. */
16699 location = d->perm.length ().to_constant () - location;
16700 }
16701
16702 offset = GEN_INT (location);
16703 emit_set_insn (d->target,
16704 gen_rtx_UNSPEC (d->vmode,
16705 gen_rtvec (3, d->op0, d->op1, offset),
16706 UNSPEC_EXT));
16707 return true;
16708 }
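/* For example, a V4SI permutation with indices {1, 2, 3, 4} selects the
   three trailing elements of the first vector followed by the leading
   element of the second, i.e. four consecutive elements of the
   concatenation starting at element offset 1, which maps onto a single
   EXT instruction. */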
16709
16710 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16711 within each 64-bit, 32-bit or 16-bit granule. */
16712
16713 static bool
16714 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16715 {
16716 HOST_WIDE_INT diff;
16717 unsigned int i, size, unspec;
16718 machine_mode pred_mode;
16719
16720 if (d->vec_flags == VEC_SVE_PRED
16721 || !d->one_vector_p
16722 || !d->perm[0].is_constant (&diff))
16723 return false;
16724
16725 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16726 if (size == 8)
16727 {
16728 unspec = UNSPEC_REV64;
16729 pred_mode = VNx2BImode;
16730 }
16731 else if (size == 4)
16732 {
16733 unspec = UNSPEC_REV32;
16734 pred_mode = VNx4BImode;
16735 }
16736 else if (size == 2)
16737 {
16738 unspec = UNSPEC_REV16;
16739 pred_mode = VNx8BImode;
16740 }
16741 else
16742 return false;
16743
16744 unsigned int step = diff + 1;
16745 for (i = 0; i < step; ++i)
16746 if (!d->perm.series_p (i, step, diff - i, step))
16747 return false;
16748
16749 /* Success! */
16750 if (d->testing_p)
16751 return true;
16752
16753 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16754 if (d->vec_flags == VEC_SVE_DATA)
16755 {
16756 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16757 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16758 UNSPEC_MERGE_PTRUE);
16759 }
16760 emit_set_insn (d->target, src);
16761 return true;
16762 }
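/* For instance, a V8HI permutation of {1, 0, 3, 2, 5, 4, 7, 6} reverses the
   16-bit elements within each 32-bit granule and is matched here as REV32;
   {3, 2, 1, 0, 7, 6, 5, 4} would likewise be matched as REV64. */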
16763
16764 /* Recognize patterns for the REV insn, which reverses elements within
16765 a full vector. */
16766
16767 static bool
16768 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16769 {
16770 poly_uint64 nelt = d->perm.length ();
16771
16772 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16773 return false;
16774
16775 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16776 return false;
16777
16778 /* Success! */
16779 if (d->testing_p)
16780 return true;
16781
16782 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16783 emit_set_insn (d->target, src);
16784 return true;
16785 }
16786
16787 static bool
16788 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16789 {
16790 rtx out = d->target;
16791 rtx in0;
16792 HOST_WIDE_INT elt;
16793 machine_mode vmode = d->vmode;
16794 rtx lane;
16795
16796 if (d->vec_flags == VEC_SVE_PRED
16797 || d->perm.encoding ().encoded_nelts () != 1
16798 || !d->perm[0].is_constant (&elt))
16799 return false;
16800
16801 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16802 return false;
16803
16804 /* Success! */
16805 if (d->testing_p)
16806 return true;
16807
16808 /* The generic preparation in aarch64_expand_vec_perm_const_1
16809 swaps the operand order and the permute indices if it finds
16810 d->perm[0] to be in the second operand. Thus, we can always
16811 use d->op0 and need not do any extra arithmetic to get the
16812 correct lane number. */
16813 in0 = d->op0;
16814 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16815
16816 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16817 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16818 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16819 return true;
16820 }
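/* For example, a permutation whose encoding is a single repeated index,
   such as {2, 2, 2, 2} on V4SI, is matched here and emitted as a duplicate
   of lane 2 of the first operand. */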
16821
16822 static bool
16823 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16824 {
16825 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16826 machine_mode vmode = d->vmode;
16827
16828 /* Make sure that the indices are constant. */
16829 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16830 for (unsigned int i = 0; i < encoded_nelts; ++i)
16831 if (!d->perm[i].is_constant ())
16832 return false;
16833
16834 if (d->testing_p)
16835 return true;
16836
16837 /* Generic code will try constant permutation twice: once with the
16838 original mode and again with the elements lowered to QImode.
16839 So wait and don't do the selector expansion ourselves. */
16840 if (vmode != V8QImode && vmode != V16QImode)
16841 return false;
16842
16843 /* to_constant is safe since this routine is specific to Advanced SIMD
16844 vectors. */
16845 unsigned int nelt = d->perm.length ().to_constant ();
16846 for (unsigned int i = 0; i < nelt; ++i)
16847 /* If big-endian and two vectors we end up with a weird mixed-endian
16848 mode on NEON. Reverse the index within each word but not the word
16849 itself. to_constant is safe because we checked is_constant above. */
16850 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16851 ? d->perm[i].to_constant () ^ (nelt - 1)
16852 : d->perm[i].to_constant ());
16853
16854 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16855 sel = force_reg (vmode, sel);
16856
16857 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16858 return true;
16859 }
16860
16861 /* Try to implement D using an SVE TBL instruction. */
16862
16863 static bool
16864 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16865 {
16866 unsigned HOST_WIDE_INT nelt;
16867
16868 /* Permuting two variable-length vectors could overflow the
16869 index range. */
16870 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16871 return false;
16872
16873 if (d->testing_p)
16874 return true;
16875
16876 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16877 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16878 if (d->one_vector_p)
16879 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16880 else
16881 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16882 return true;
16883 }
16884
16885 static bool
16886 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16887 {
16888 /* The pattern matching functions above are written to look for a small
16889 number to begin the sequence (0, 1, N/2). If we begin with an index
16890 from the second operand, we can swap the operands. */
16891 poly_int64 nelt = d->perm.length ();
16892 if (known_ge (d->perm[0], nelt))
16893 {
16894 d->perm.rotate_inputs (1);
16895 std::swap (d->op0, d->op1);
16896 }
16897
16898 if ((d->vec_flags == VEC_ADVSIMD
16899 || d->vec_flags == VEC_SVE_DATA
16900 || d->vec_flags == VEC_SVE_PRED)
16901 && known_gt (nelt, 1))
16902 {
16903 if (aarch64_evpc_rev_local (d))
16904 return true;
16905 else if (aarch64_evpc_rev_global (d))
16906 return true;
16907 else if (aarch64_evpc_ext (d))
16908 return true;
16909 else if (aarch64_evpc_dup (d))
16910 return true;
16911 else if (aarch64_evpc_zip (d))
16912 return true;
16913 else if (aarch64_evpc_uzp (d))
16914 return true;
16915 else if (aarch64_evpc_trn (d))
16916 return true;
16917 if (d->vec_flags == VEC_SVE_DATA)
16918 return aarch64_evpc_sve_tbl (d);
16919 else if (d->vec_flags == VEC_ADVSIMD)
16920 return aarch64_evpc_tbl (d);
16921 }
16922 return false;
16923 }
16924
16925 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16926
16927 static bool
16928 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16929 rtx op1, const vec_perm_indices &sel)
16930 {
16931 struct expand_vec_perm_d d;
16932
16933 /* Check whether the mask can be applied to a single vector. */
16934 if (sel.ninputs () == 1
16935 || (op0 && rtx_equal_p (op0, op1)))
16936 d.one_vector_p = true;
16937 else if (sel.all_from_input_p (0))
16938 {
16939 d.one_vector_p = true;
16940 op1 = op0;
16941 }
16942 else if (sel.all_from_input_p (1))
16943 {
16944 d.one_vector_p = true;
16945 op0 = op1;
16946 }
16947 else
16948 d.one_vector_p = false;
16949
16950 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16951 sel.nelts_per_input ());
16952 d.vmode = vmode;
16953 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16954 d.target = target;
16955 d.op0 = op0;
16956 d.op1 = op1;
16957 d.testing_p = !target;
16958
16959 if (!d.testing_p)
16960 return aarch64_expand_vec_perm_const_1 (&d);
16961
16962 rtx_insn *last = get_last_insn ();
16963 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16964 gcc_assert (last == get_last_insn ());
16965
16966 return ret;
16967 }
16968
16969 /* Generate a byte permute mask for a register of mode MODE,
16970 which has NUNITS units. */
16971
16972 rtx
16973 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16974 {
16975 /* We have to reverse each vector because we don't have
16976 a permuted load that can reverse-load according to ABI rules. */
16977 rtx mask;
16978 rtvec v = rtvec_alloc (16);
16979 unsigned int i, j;
16980 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16981
16982 gcc_assert (BYTES_BIG_ENDIAN);
16983 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16984
16985 for (i = 0; i < nunits; i++)
16986 for (j = 0; j < usize; j++)
16987 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16988 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16989 return force_reg (V16QImode, mask);
16990 }
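/* For a V4SImode register this builds the byte selector
   {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}, i.e. the bytes of
   each 32-bit unit are reversed while the order of the units is kept. */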
16991
16992 /* Return true if X is a valid second operand for the SVE instruction
16993 that implements integer comparison OP_CODE. */
16994
16995 static bool
16996 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16997 {
16998 if (register_operand (x, VOIDmode))
16999 return true;
17000
17001 switch (op_code)
17002 {
17003 case LTU:
17004 case LEU:
17005 case GEU:
17006 case GTU:
17007 return aarch64_sve_cmp_immediate_p (x, false);
17008 case LT:
17009 case LE:
17010 case GE:
17011 case GT:
17012 case NE:
17013 case EQ:
17014 return aarch64_sve_cmp_immediate_p (x, true);
17015 default:
17016 gcc_unreachable ();
17017 }
17018 }
17019
17020 /* Use predicated SVE instructions to implement the equivalent of:
17021
17022 (set TARGET OP)
17023
17024 given that PTRUE is an all-true predicate of the appropriate mode. */
17025
17026 static void
17027 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17028 {
17029 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17030 gen_rtvec (2, ptrue, op),
17031 UNSPEC_MERGE_PTRUE);
17032 rtx_insn *insn = emit_set_insn (target, unspec);
17033 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17034 }
17035
17036 /* Likewise, but also clobber the condition codes. */
17037
17038 static void
17039 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17040 {
17041 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17042 gen_rtvec (2, ptrue, op),
17043 UNSPEC_MERGE_PTRUE);
17044 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
17045 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17046 }
17047
17048 /* Return the UNSPEC_COND_* code for comparison CODE. */
17049
17050 static unsigned int
17051 aarch64_unspec_cond_code (rtx_code code)
17052 {
17053 switch (code)
17054 {
17055 case NE:
17056 return UNSPEC_COND_NE;
17057 case EQ:
17058 return UNSPEC_COND_EQ;
17059 case LT:
17060 return UNSPEC_COND_LT;
17061 case GT:
17062 return UNSPEC_COND_GT;
17063 case LE:
17064 return UNSPEC_COND_LE;
17065 case GE:
17066 return UNSPEC_COND_GE;
17067 default:
17068 gcc_unreachable ();
17069 }
17070 }
17071
17072 /* Emit:
17073
17074 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17075
17076 where <X> is the operation associated with comparison CODE. This form
17077 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17078 semantics, such as when PRED might not be all-true and when comparing
17079 inactive lanes could have side effects. */
17080
17081 static void
17082 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17083 rtx pred, rtx op0, rtx op1)
17084 {
17085 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17086 gen_rtvec (3, pred, op0, op1),
17087 aarch64_unspec_cond_code (code));
17088 emit_set_insn (target, unspec);
17089 }
17090
17091 /* Expand an SVE integer comparison using the SVE equivalent of:
17092
17093 (set TARGET (CODE OP0 OP1)). */
17094
17095 void
17096 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17097 {
17098 machine_mode pred_mode = GET_MODE (target);
17099 machine_mode data_mode = GET_MODE (op0);
17100
17101 if (!aarch64_sve_cmp_operand_p (code, op1))
17102 op1 = force_reg (data_mode, op1);
17103
17104 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17105 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17106 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17107 }
17108
17109 /* Emit the SVE equivalent of:
17110
17111 (set TMP1 (CODE1 OP0 OP1))
17112 (set TMP2 (CODE2 OP0 OP1))
17113 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17114
17115 PTRUE is an all-true predicate with the same mode as TARGET. */
17116
17117 static void
17118 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17119 rtx ptrue, rtx op0, rtx op1)
17120 {
17121 machine_mode pred_mode = GET_MODE (ptrue);
17122 rtx tmp1 = gen_reg_rtx (pred_mode);
17123 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17124 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17125 rtx tmp2 = gen_reg_rtx (pred_mode);
17126 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17127 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17128 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17129 }
17130
17131 /* Emit the SVE equivalent of:
17132
17133 (set TMP (CODE OP0 OP1))
17134 (set TARGET (not TMP))
17135
17136 PTRUE is an all-true predicate with the same mode as TARGET. */
17137
17138 static void
17139 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17140 rtx op0, rtx op1)
17141 {
17142 machine_mode pred_mode = GET_MODE (ptrue);
17143 rtx tmp = gen_reg_rtx (pred_mode);
17144 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17145 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17146 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17147 }
17148
17149 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17150
17151 (set TARGET (CODE OP0 OP1))
17152
17153 If CAN_INVERT_P is true, the caller can also handle inverted results;
17154 return true if the result is in fact inverted. */
17155
17156 bool
17157 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17158 rtx op0, rtx op1, bool can_invert_p)
17159 {
17160 machine_mode pred_mode = GET_MODE (target);
17161 machine_mode data_mode = GET_MODE (op0);
17162
17163 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17164 switch (code)
17165 {
17166 case UNORDERED:
17167 /* UNORDERED has no immediate form. */
17168 op1 = force_reg (data_mode, op1);
17169 /* fall through */
17170 case LT:
17171 case LE:
17172 case GT:
17173 case GE:
17174 case EQ:
17175 case NE:
17176 {
17177 /* There is native support for the comparison. */
17178 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17179 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17180 return false;
17181 }
17182
17183 case LTGT:
17184 /* This is a trapping operation (LT or GT). */
17185 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17186 return false;
17187
17188 case UNEQ:
17189 if (!flag_trapping_math)
17190 {
17191 /* This would trap for signaling NaNs. */
17192 op1 = force_reg (data_mode, op1);
17193 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17194 return false;
17195 }
17196 /* fall through */
17197 case UNLT:
17198 case UNLE:
17199 case UNGT:
17200 case UNGE:
17201 if (flag_trapping_math)
17202 {
17203 /* Work out which elements are ordered. */
17204 rtx ordered = gen_reg_rtx (pred_mode);
17205 op1 = force_reg (data_mode, op1);
17206 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17207
17208 /* Test the opposite condition for the ordered elements,
17209 then invert the result. */
17210 if (code == UNEQ)
17211 code = NE;
17212 else
17213 code = reverse_condition_maybe_unordered (code);
17214 if (can_invert_p)
17215 {
17216 aarch64_emit_sve_predicated_cond (target, code,
17217 ordered, op0, op1);
17218 return true;
17219 }
17220 rtx tmp = gen_reg_rtx (pred_mode);
17221 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17222 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17223 return false;
17224 }
17225 break;
17226
17227 case ORDERED:
17228 /* ORDERED has no immediate form. */
17229 op1 = force_reg (data_mode, op1);
17230 break;
17231
17232 default:
17233 gcc_unreachable ();
17234 }
17235
17236 /* There is native support for the inverse comparison. */
17237 code = reverse_condition_maybe_unordered (code);
17238 if (can_invert_p)
17239 {
17240 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17241 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17242 return true;
17243 }
17244 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17245 return false;
17246 }
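/* As a worked example of the trapping-math path above: UNLT (x, y) is
   equivalent to !(ORDERED (x, y) && GE (x, y)), so the expansion computes
   the ordered lanes by inverting an UNORDERED compare, performs the
   reversed comparison GE only on those lanes, and then either reports the
   result as inverted (when CAN_INVERT_P) or inverts it explicitly. */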
17247
17248 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17249 of the data being selected and CMP_MODE is the mode of the values being
17250 compared. */
17251
17252 void
17253 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17254 rtx *ops)
17255 {
17256 machine_mode pred_mode
17257 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17258 GET_MODE_SIZE (cmp_mode)).require ();
17259 rtx pred = gen_reg_rtx (pred_mode);
17260 if (FLOAT_MODE_P (cmp_mode))
17261 {
17262 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17263 ops[4], ops[5], true))
17264 std::swap (ops[1], ops[2]);
17265 }
17266 else
17267 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17268
17269 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17270 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17271 }
17272
17273 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17274 true. However, due to issues with register allocation it is preferable
17275 to avoid tying integer scalar and FP scalar modes. Executing integer
17276 operations in general registers is better than treating them as scalar
17277 vector operations. This reduces latency and avoids redundant int<->FP
17278 moves. So tie modes if they are either the same class, or vector modes
17279 with other vector modes, vector structs or any scalar mode. */
17280
17281 static bool
17282 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17283 {
17284 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17285 return true;
17286
17287 /* We specifically want to allow elements of "structure" modes to
17288 be tieable to the structure. This more general condition allows
17289 other rarer situations too. The reason we don't extend this to
17290 predicate modes is that there are no predicate structure modes
17291 nor any specific instructions for extracting part of a predicate
17292 register. */
17293 if (aarch64_vector_data_mode_p (mode1)
17294 && aarch64_vector_data_mode_p (mode2))
17295 return true;
17296
17297 /* Also allow any scalar modes with vectors. */
17298 if (aarch64_vector_mode_supported_p (mode1)
17299 || aarch64_vector_mode_supported_p (mode2))
17300 return true;
17301
17302 return false;
17303 }
17304
17305 /* Return a new RTX holding the result of moving POINTER forward by
17306 AMOUNT bytes. */
17307
17308 static rtx
17309 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17310 {
17311 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17312
17313 return adjust_automodify_address (pointer, GET_MODE (pointer),
17314 next, amount);
17315 }
17316
17317 /* Return a new RTX holding the result of moving POINTER forward by the
17318 size of the mode it points to. */
17319
17320 static rtx
17321 aarch64_progress_pointer (rtx pointer)
17322 {
17323 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17324 }
17325
17326 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17327 MODE bytes. */
17328
17329 static void
17330 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17331 machine_mode mode)
17332 {
17333 rtx reg = gen_reg_rtx (mode);
17334
17335 /* "Cast" the pointers to the correct mode. */
17336 *src = adjust_address (*src, mode, 0);
17337 *dst = adjust_address (*dst, mode, 0);
17338 /* Emit the memcpy. */
17339 emit_move_insn (reg, *src);
17340 emit_move_insn (*dst, reg);
17341 /* Move the pointers forward. */
17342 *src = aarch64_progress_pointer (*src);
17343 *dst = aarch64_progress_pointer (*dst);
17344 }
17345
17346 /* Expand movmem, as if from a __builtin_memcpy. Return true if
17347 we succeed, otherwise return false. */
17348
17349 bool
17350 aarch64_expand_movmem (rtx *operands)
17351 {
17352 int n, mode_bits;
17353 rtx dst = operands[0];
17354 rtx src = operands[1];
17355 rtx base;
17356 machine_mode cur_mode = BLKmode, next_mode;
17357 bool speed_p = !optimize_function_for_size_p (cfun);
17358
17359 /* When optimizing for size, give a better estimate of the length of a
17360 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17361 will always require an even number of instructions to do now, and each
17362 operation requires both a load and a store, so divide the max number by 2. */
17363 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17364
17365 /* We can't do anything smart if the amount to copy is not constant. */
17366 if (!CONST_INT_P (operands[2]))
17367 return false;
17368
17369 n = INTVAL (operands[2]);
17370
17371 /* Try to keep the number of instructions low. For all cases we will do at
17372 most two moves for the residual amount, since we'll always overlap the
17373 remainder. */
17374 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17375 return false;
17376
17377 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17378 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17379
17380 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17381 src = adjust_automodify_address (src, VOIDmode, base, 0);
17382
17383 /* Convert n to bits to make the rest of the code simpler. */
17384 n = n * BITS_PER_UNIT;
17385
17386 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17387 larger than TImode, but we should not use them for loads/stores here. */
17388 const int copy_limit = GET_MODE_BITSIZE (TImode);
17389
17390 while (n > 0)
17391 {
17392 /* Find the largest mode in which to do the copy without over-reading
17393 or over-writing. */
17394 opt_scalar_int_mode mode_iter;
17395 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17396 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17397 cur_mode = mode_iter.require ();
17398
17399 gcc_assert (cur_mode != BLKmode);
17400
17401 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17402 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17403
17404 n -= mode_bits;
17405
17406 /* Do certain trailing copies as overlapping if it's going to be
17407 cheaper, i.e. fewer instructions to do so. For instance, for a 15
17408 byte copy it's more efficient to do two overlapping 8 byte copies than
17409 separate 8 + 4 + 2 + 1 byte copies. */
17410 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17411 {
17412 next_mode = smallest_mode_for_size (n, MODE_INT);
17413 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17414 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17415 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17416 n = n_bits;
17417 }
17418 }
17419
17420 return true;
17421 }
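/* Worked example for a 15 byte copy (n = 120 bits): the first iteration
   selects DImode and copies 8 bytes, leaving 7 bytes; since that residue is
   at most 8 bytes, the pointers are moved back by one byte and a second,
   overlapping DImode copy finishes the job, so only two load/store pairs
   are emitted. */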
17422
17423 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17424 SImode stores. Handle the case when the constant has identical
17425 bottom and top halves. This is beneficial when the two stores can be
17426 merged into an STP and we avoid synthesising potentially expensive
17427 immediates twice. Return true if such a split is possible. */
17428
17429 bool
17430 aarch64_split_dimode_const_store (rtx dst, rtx src)
17431 {
17432 rtx lo = gen_lowpart (SImode, src);
17433 rtx hi = gen_highpart_mode (SImode, DImode, src);
17434
17435 bool size_p = optimize_function_for_size_p (cfun);
17436
17437 if (!rtx_equal_p (lo, hi))
17438 return false;
17439
17440 unsigned int orig_cost
17441 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17442 unsigned int lo_cost
17443 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17444
17445 /* We want to transform:
17446 MOV x1, 49370
17447 MOVK x1, 0x140, lsl 16
17448 MOVK x1, 0xc0da, lsl 32
17449 MOVK x1, 0x140, lsl 48
17450 STR x1, [x0]
17451 into:
17452 MOV w1, 49370
17453 MOVK w1, 0x140, lsl 16
17454 STP w1, w1, [x0]
17455 So we want to perform this only when we save two instructions
17456 or more. When optimizing for size, however, accept any code size
17457 savings we can. */
17458 if (size_p && orig_cost <= lo_cost)
17459 return false;
17460
17461 if (!size_p
17462 && (orig_cost <= lo_cost + 1))
17463 return false;
17464
17465 rtx mem_lo = adjust_address (dst, SImode, 0);
17466 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17467 return false;
17468
17469 rtx tmp_reg = gen_reg_rtx (SImode);
17470 aarch64_expand_mov_immediate (tmp_reg, lo);
17471 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17472 /* Don't emit an explicit store pair as this may not always be profitable.
17473 Let the sched-fusion logic decide whether to merge them. */
17474 emit_move_insn (mem_lo, tmp_reg);
17475 emit_move_insn (mem_hi, tmp_reg);
17476
17477 return true;
17478 }
17479
17480 /* Generate RTL for a conditional branch with rtx comparison CODE in
17481 mode CC_MODE. The destination of the unlikely conditional branch
17482 is LABEL_REF. */
17483
17484 void
17485 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17486 rtx label_ref)
17487 {
17488 rtx x;
17489 x = gen_rtx_fmt_ee (code, VOIDmode,
17490 gen_rtx_REG (cc_mode, CC_REGNUM),
17491 const0_rtx);
17492
17493 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17494 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17495 pc_rtx);
17496 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17497 }
17498
17499 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17500
17501 OP1 represents the TImode destination operand 1
17502 OP2 represents the TImode destination operand 2
17503 LOW_DEST represents the low half (DImode) of TImode operand 0
17504 LOW_IN1 represents the low half (DImode) of TImode operand 1
17505 LOW_IN2 represents the low half (DImode) of TImode operand 2
17506 HIGH_DEST represents the high half (DImode) of TImode operand 0
17507 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17508 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17509
17510 void
17511 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17512 rtx *low_in1, rtx *low_in2,
17513 rtx *high_dest, rtx *high_in1,
17514 rtx *high_in2)
17515 {
17516 *low_dest = gen_reg_rtx (DImode);
17517 *low_in1 = gen_lowpart (DImode, op1);
17518 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17519 subreg_lowpart_offset (DImode, TImode));
17520 *high_dest = gen_reg_rtx (DImode);
17521 *high_in1 = gen_highpart (DImode, op1);
17522 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17523 subreg_highpart_offset (DImode, TImode));
17524 }
17525
17526 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17527
17528 This function differs from 'aarch64_addti_scratch_regs' in that
17529 OP1 can be an immediate constant (zero). We must call
17530 subreg_highpart_offset with DImode and TImode arguments, otherwise
17531 VOIDmode will be used for the const_int which generates an internal
17532 error from subreg_size_highpart_offset which does not expect a size of zero.
17533
17534 OP1 represents the TImode destination operand 1
17535 OP2 represents the TImode destination operand 2
17536 LOW_DEST represents the low half (DImode) of TImode operand 0
17537 LOW_IN1 represents the low half (DImode) of TImode operand 1
17538 LOW_IN2 represents the low half (DImode) of TImode operand 2
17539 HIGH_DEST represents the high half (DImode) of TImode operand 0
17540 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17541 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17542
17543
17544 void
17545 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17546 rtx *low_in1, rtx *low_in2,
17547 rtx *high_dest, rtx *high_in1,
17548 rtx *high_in2)
17549 {
17550 *low_dest = gen_reg_rtx (DImode);
17551 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17552 subreg_lowpart_offset (DImode, TImode));
17553
17554 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17555 subreg_lowpart_offset (DImode, TImode));
17556 *high_dest = gen_reg_rtx (DImode);
17557
17558 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17559 subreg_highpart_offset (DImode, TImode));
17560 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17561 subreg_highpart_offset (DImode, TImode));
17562 }
17563
17564 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17565
17566 OP0 represents the TImode destination operand 0
17567 LOW_DEST represents the low half (DImode) of TImode operand 0
17568 LOW_IN1 represents the low half (DImode) of TImode operand 1
17569 LOW_IN2 represents the low half (DImode) of TImode operand 2
17570 HIGH_DEST represents the high half (DImode) of TImode operand 0
17571 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17572 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17573 UNSIGNED_P is true if the operation is being performed on unsigned
17574 values. */
17575 void
17576 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17577 rtx low_in2, rtx high_dest, rtx high_in1,
17578 rtx high_in2, bool unsigned_p)
17579 {
17580 if (low_in2 == const0_rtx)
17581 {
17582 low_dest = low_in1;
17583 high_in2 = force_reg (DImode, high_in2);
17584 if (unsigned_p)
17585 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17586 else
17587 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17588 }
17589 else
17590 {
17591 if (CONST_INT_P (low_in2))
17592 {
17593 high_in2 = force_reg (DImode, high_in2);
17594 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17595 GEN_INT (-INTVAL (low_in2))));
17596 }
17597 else
17598 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17599
17600 if (unsigned_p)
17601 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17602 else
17603 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17604 }
17605
17606 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17607 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17608
17609 }
17610
17611 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17612
17613 static unsigned HOST_WIDE_INT
17614 aarch64_asan_shadow_offset (void)
17615 {
17616 if (TARGET_ILP32)
17617 return (HOST_WIDE_INT_1 << 29);
17618 else
17619 return (HOST_WIDE_INT_1 << 36);
17620 }
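/* With the usual ASan mapping of shadow = (address >> 3) + offset, this
   places the LP64 shadow region at (addr >> 3) + (1 << 36), and the ILP32
   one at (addr >> 3) + (1 << 29). The mapping described here is the generic
   ASan scheme rather than anything specific to this hook. */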
17621
17622 static rtx
17623 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17624 int code, tree treeop0, tree treeop1)
17625 {
17626 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17627 rtx op0, op1;
17628 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17629 insn_code icode;
17630 struct expand_operand ops[4];
17631
17632 start_sequence ();
17633 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17634
17635 op_mode = GET_MODE (op0);
17636 if (op_mode == VOIDmode)
17637 op_mode = GET_MODE (op1);
17638
17639 switch (op_mode)
17640 {
17641 case E_QImode:
17642 case E_HImode:
17643 case E_SImode:
17644 cmp_mode = SImode;
17645 icode = CODE_FOR_cmpsi;
17646 break;
17647
17648 case E_DImode:
17649 cmp_mode = DImode;
17650 icode = CODE_FOR_cmpdi;
17651 break;
17652
17653 case E_SFmode:
17654 cmp_mode = SFmode;
17655 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17656 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17657 break;
17658
17659 case E_DFmode:
17660 cmp_mode = DFmode;
17661 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17662 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17663 break;
17664
17665 default:
17666 end_sequence ();
17667 return NULL_RTX;
17668 }
17669
17670 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17671 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17672 if (!op0 || !op1)
17673 {
17674 end_sequence ();
17675 return NULL_RTX;
17676 }
17677 *prep_seq = get_insns ();
17678 end_sequence ();
17679
17680 create_fixed_operand (&ops[0], op0);
17681 create_fixed_operand (&ops[1], op1);
17682
17683 start_sequence ();
17684 if (!maybe_expand_insn (icode, 2, ops))
17685 {
17686 end_sequence ();
17687 return NULL_RTX;
17688 }
17689 *gen_seq = get_insns ();
17690 end_sequence ();
17691
17692 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17693 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17694 }
17695
17696 static rtx
17697 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17698 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17699 {
17700 rtx op0, op1, target;
17701 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17702 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17703 insn_code icode;
17704 struct expand_operand ops[6];
17705 int aarch64_cond;
17706
17707 push_to_sequence (*prep_seq);
17708 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17709
17710 op_mode = GET_MODE (op0);
17711 if (op_mode == VOIDmode)
17712 op_mode = GET_MODE (op1);
17713
17714 switch (op_mode)
17715 {
17716 case E_QImode:
17717 case E_HImode:
17718 case E_SImode:
17719 cmp_mode = SImode;
17720 icode = CODE_FOR_ccmpsi;
17721 break;
17722
17723 case E_DImode:
17724 cmp_mode = DImode;
17725 icode = CODE_FOR_ccmpdi;
17726 break;
17727
17728 case E_SFmode:
17729 cmp_mode = SFmode;
17730 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17731 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17732 break;
17733
17734 case E_DFmode:
17735 cmp_mode = DFmode;
17736 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17737 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17738 break;
17739
17740 default:
17741 end_sequence ();
17742 return NULL_RTX;
17743 }
17744
17745 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17746 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17747 if (!op0 || !op1)
17748 {
17749 end_sequence ();
17750 return NULL_RTX;
17751 }
17752 *prep_seq = get_insns ();
17753 end_sequence ();
17754
17755 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17756 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17757
17758 if (bit_code != AND)
17759 {
17760 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17761 GET_MODE (XEXP (prev, 0))),
17762 VOIDmode, XEXP (prev, 0), const0_rtx);
17763 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17764 }
17765
17766 create_fixed_operand (&ops[0], XEXP (prev, 0));
17767 create_fixed_operand (&ops[1], target);
17768 create_fixed_operand (&ops[2], op0);
17769 create_fixed_operand (&ops[3], op1);
17770 create_fixed_operand (&ops[4], prev);
17771 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17772
17773 push_to_sequence (*gen_seq);
17774 if (!maybe_expand_insn (icode, 6, ops))
17775 {
17776 end_sequence ();
17777 return NULL_RTX;
17778 }
17779
17780 *gen_seq = get_insns ();
17781 end_sequence ();
17782
17783 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17784 }
17785
17786 #undef TARGET_GEN_CCMP_FIRST
17787 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17788
17789 #undef TARGET_GEN_CCMP_NEXT
17790 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
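/* As an illustrative example (hypothetical source, not from this file), the
   two hooks above let a condition such as "a == 0 && b > 5" be expanded into
   a compare followed by a conditional compare instead of two branches:

     cmp   w0, #0           // flags for a == 0
     ccmp  w1, #5, #4, eq   // if EQ, compare b with 5; otherwise NZCV := 0b0100
     b.gt  .Ltaken          // taken only when a == 0 and b > 5

   The #4 immediate sets the Z flag when the first comparison fails, which
   makes the final GT test false.  */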
17791
17792 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17793 instruction fusion of some sort. */
17794
17795 static bool
17796 aarch64_macro_fusion_p (void)
17797 {
17798 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17799 }
17800
17801
17802 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17803 should be kept together during scheduling. */
17804
17805 static bool
17806 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17807 {
17808 rtx set_dest;
17809 rtx prev_set = single_set (prev);
17810 rtx curr_set = single_set (curr);
17811 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
17812 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17813
17814 if (!aarch64_macro_fusion_p ())
17815 return false;
17816
17817 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17818 {
17819 /* We are trying to match:
17820 prev (mov) == (set (reg r0) (const_int imm16))
17821 curr (movk) == (set (zero_extract (reg r0)
17822 (const_int 16)
17823 (const_int 16))
17824 (const_int imm16_1)) */
17825
17826 set_dest = SET_DEST (curr_set);
17827
17828 if (GET_CODE (set_dest) == ZERO_EXTRACT
17829 && CONST_INT_P (SET_SRC (curr_set))
17830 && CONST_INT_P (SET_SRC (prev_set))
17831 && CONST_INT_P (XEXP (set_dest, 2))
17832 && INTVAL (XEXP (set_dest, 2)) == 16
17833 && REG_P (XEXP (set_dest, 0))
17834 && REG_P (SET_DEST (prev_set))
17835 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17836 {
17837 return true;
17838 }
17839 }
17840
17841 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17842 {
17843
17844 /* We're trying to match:
17845 prev (adrp) == (set (reg r1)
17846 (high (symbol_ref ("SYM"))))
17847 curr (add) == (set (reg r0)
17848 (lo_sum (reg r1)
17849 (symbol_ref ("SYM"))))
17850 Note that r0 need not necessarily be the same as r1, especially
17851 during pre-regalloc scheduling. */
17852
17853 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17854 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17855 {
17856 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17857 && REG_P (XEXP (SET_SRC (curr_set), 0))
17858 && REGNO (XEXP (SET_SRC (curr_set), 0))
17859 == REGNO (SET_DEST (prev_set))
17860 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17861 XEXP (SET_SRC (curr_set), 1)))
17862 return true;
17863 }
17864 }
17865
17866 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17867 {
17868
17869 /* We're trying to match:
17870 prev (movk) == (set (zero_extract (reg r0)
17871 (const_int 16)
17872 (const_int 32))
17873 (const_int imm16_1))
17874 curr (movk) == (set (zero_extract (reg r0)
17875 (const_int 16)
17876 (const_int 48))
17877 (const_int imm16_2)) */
17878
17879 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17880 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17881 && REG_P (XEXP (SET_DEST (prev_set), 0))
17882 && REG_P (XEXP (SET_DEST (curr_set), 0))
17883 && REGNO (XEXP (SET_DEST (prev_set), 0))
17884 == REGNO (XEXP (SET_DEST (curr_set), 0))
17885 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17886 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17887 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17888 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17889 && CONST_INT_P (SET_SRC (prev_set))
17890 && CONST_INT_P (SET_SRC (curr_set)))
17891 return true;
17892
17893 }
17894 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17895 {
17896 /* We're trying to match:
17897 prev (adrp) == (set (reg r0)
17898 (high (symbol_ref ("SYM"))))
17899 curr (ldr) == (set (reg r1)
17900 (mem (lo_sum (reg r0)
17901 (symbol_ref ("SYM")))))
17902 or
17903 curr (ldr) == (set (reg r1)
17904 (zero_extend (mem
17905 (lo_sum (reg r0)
17906 (symbol_ref ("SYM")))))) */
17907 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17908 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17909 {
17910 rtx curr_src = SET_SRC (curr_set);
17911
17912 if (GET_CODE (curr_src) == ZERO_EXTEND)
17913 curr_src = XEXP (curr_src, 0);
17914
17915 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17916 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17917 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17918 == REGNO (SET_DEST (prev_set))
17919 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17920 XEXP (SET_SRC (prev_set), 0)))
17921 return true;
17922 }
17923 }
17924
17925 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17926 && aarch_crypto_can_dual_issue (prev, curr))
17927 return true;
17928
17929 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17930 && any_condjump_p (curr))
17931 {
17932 unsigned int condreg1, condreg2;
17933 rtx cc_reg_1;
17934 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17935 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17936
17937 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17938 && prev
17939 && modified_in_p (cc_reg_1, prev))
17940 {
17941 enum attr_type prev_type = get_attr_type (prev);
17942
17946 17943 /* FIXME: this misses some cases that ThunderX considers simple
17947 17944 arithmetic instructions. Simple shifts are also missed here. */
17945 if (prev_type == TYPE_ALUS_SREG
17946 || prev_type == TYPE_ALUS_IMM
17947 || prev_type == TYPE_LOGICS_REG
17948 || prev_type == TYPE_LOGICS_IMM)
17949 return true;
17950 }
17951 }
17952
17953 if (prev_set
17954 && curr_set
17955 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17956 && any_condjump_p (curr))
17957 {
17958 /* We're trying to match:
17959 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17960 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17961 (const_int 0))
17962 (label_ref ("SYM"))
17963 (pc)) */
17964 if (SET_DEST (curr_set) == (pc_rtx)
17965 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17966 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17967 && REG_P (SET_DEST (prev_set))
17968 && REGNO (SET_DEST (prev_set))
17969 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17970 {
17971 /* Fuse ALU operations followed by conditional branch instruction. */
17972 switch (get_attr_type (prev))
17973 {
17974 case TYPE_ALU_IMM:
17975 case TYPE_ALU_SREG:
17976 case TYPE_ADC_REG:
17977 case TYPE_ADC_IMM:
17978 case TYPE_ADCS_REG:
17979 case TYPE_ADCS_IMM:
17980 case TYPE_LOGIC_REG:
17981 case TYPE_LOGIC_IMM:
17982 case TYPE_CSEL:
17983 case TYPE_ADR:
17984 case TYPE_MOV_IMM:
17985 case TYPE_SHIFT_REG:
17986 case TYPE_SHIFT_IMM:
17987 case TYPE_BFM:
17988 case TYPE_RBIT:
17989 case TYPE_REV:
17990 case TYPE_EXTEND:
17991 return true;
17992
17993 default:;
17994 }
17995 }
17996 }
17997
17998 return false;
17999 }
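/* As a purely illustrative summary (register names, immediates and symbols
   are made up), the pairs accepted above correspond to sequences such as:

     mov   x0, #0x1234        +  movk  x0, #0x5678, lsl #16    (MOV_MOVK)
     adrp  x1, sym            +  add   x0, x1, :lo12:sym       (ADRP_ADD)
     adrp  x0, sym            +  ldr   x1, [x0, :lo12:sym]     (ADRP_LDR)
     aese  v0.16b, v1.16b     +  aesmc v0.16b, v0.16b          (AES_AESMC)
     cmp   w0, #0             +  b.ne  .L1                     (CMP_BRANCH)
     add   w0, w1, #1         +  cbz   w0, .L1                 (ALU_BRANCH)  */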
18000
18001 /* Return true iff the instruction fusion described by OP is enabled. */
18002
18003 bool
18004 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18005 {
18006 return (aarch64_tune_params.fusible_ops & op) != 0;
18007 }
18008
18012 18009 /* If MEM is in the form of [base+offset], extract the two parts of the
18013 18010 address, store them in BASE and OFFSET, and return true; otherwise
18014 18011 return false after clearing BASE and OFFSET. */
18012
18013 bool
18014 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18015 {
18016 rtx addr;
18017
18018 gcc_assert (MEM_P (mem));
18019
18020 addr = XEXP (mem, 0);
18021
18022 if (REG_P (addr))
18023 {
18024 *base = addr;
18025 *offset = const0_rtx;
18026 return true;
18027 }
18028
18029 if (GET_CODE (addr) == PLUS
18030 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18031 {
18032 *base = XEXP (addr, 0);
18033 *offset = XEXP (addr, 1);
18034 return true;
18035 }
18036
18037 *base = NULL_RTX;
18038 *offset = NULL_RTX;
18039
18040 return false;
18041 }
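/* For example, a MEM whose address is (plus (reg x1) (const_int 16)) yields
   BASE == (reg x1) and OFFSET == (const_int 16), while a MEM whose address
   is just (reg x1) yields OFFSET == (const_int 0).  Any other address form
   (pre/post increment, register index, ...) returns false.  */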
18042
18043 /* Types for scheduling fusion. */
18044 enum sched_fusion_type
18045 {
18046 SCHED_FUSION_NONE = 0,
18047 SCHED_FUSION_LD_SIGN_EXTEND,
18048 SCHED_FUSION_LD_ZERO_EXTEND,
18049 SCHED_FUSION_LD,
18050 SCHED_FUSION_ST,
18051 SCHED_FUSION_NUM
18052 };
18053
18057 18054 /* If INSN is a load or store with an address in the form of [base+offset],
18058 18055 extract the two parts into BASE and OFFSET. Return the scheduling
18059 18056 fusion type of INSN. */
18057
18058 static enum sched_fusion_type
18059 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18060 {
18061 rtx x, dest, src;
18062 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18063
18064 gcc_assert (INSN_P (insn));
18065 x = PATTERN (insn);
18066 if (GET_CODE (x) != SET)
18067 return SCHED_FUSION_NONE;
18068
18069 src = SET_SRC (x);
18070 dest = SET_DEST (x);
18071
18072 machine_mode dest_mode = GET_MODE (dest);
18073
18074 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18075 return SCHED_FUSION_NONE;
18076
18077 if (GET_CODE (src) == SIGN_EXTEND)
18078 {
18079 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18080 src = XEXP (src, 0);
18081 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18082 return SCHED_FUSION_NONE;
18083 }
18084 else if (GET_CODE (src) == ZERO_EXTEND)
18085 {
18086 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18087 src = XEXP (src, 0);
18088 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18089 return SCHED_FUSION_NONE;
18090 }
18091
18092 if (GET_CODE (src) == MEM && REG_P (dest))
18093 extract_base_offset_in_addr (src, base, offset);
18094 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18095 {
18096 fusion = SCHED_FUSION_ST;
18097 extract_base_offset_in_addr (dest, base, offset);
18098 }
18099 else
18100 return SCHED_FUSION_NONE;
18101
18102 if (*base == NULL_RTX || *offset == NULL_RTX)
18103 fusion = SCHED_FUSION_NONE;
18104
18105 return fusion;
18106 }
18107
18108 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18109
18113 18110 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18114 18111 and PRI are only calculated for these instructions. For other instructions,
18115 18112 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18116 18113 types of instruction fusion can be added by returning different priorities.
18114
18115 It's important that irrelevant instructions get the largest FUSION_PRI. */
18116
18117 static void
18118 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18119 int *fusion_pri, int *pri)
18120 {
18121 int tmp, off_val;
18122 rtx base, offset;
18123 enum sched_fusion_type fusion;
18124
18125 gcc_assert (INSN_P (insn));
18126
18127 tmp = max_pri - 1;
18128 fusion = fusion_load_store (insn, &base, &offset);
18129 if (fusion == SCHED_FUSION_NONE)
18130 {
18131 *pri = tmp;
18132 *fusion_pri = tmp;
18133 return;
18134 }
18135
18136 /* Set FUSION_PRI according to fusion type and base register. */
18137 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18138
18139 /* Calculate PRI. */
18140 tmp /= 2;
18141
18142 /* INSN with smaller offset goes first. */
18143 off_val = (int)(INTVAL (offset));
18144 if (off_val >= 0)
18145 tmp -= (off_val & 0xfffff);
18146 else
18147 tmp += ((- off_val) & 0xfffff);
18148
18149 *pri = tmp;
18150 return;
18151 }
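/* As a worked example (max_pri chosen arbitrarily as 1000), two SImode loads
   from [x1, #4] and [x1, #8] both get the same FUSION_PRI,
   999 - SCHED_FUSION_LD * FIRST_PSEUDO_REGISTER - REGNO (x1), while their
   PRI values are 499 - 4 and 499 - 8 respectively, so the access with the
   smaller offset is preferred first.  */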
18152
18153 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18154 Adjust priority of sha1h instructions so they are scheduled before
18155 other SHA1 instructions. */
18156
18157 static int
18158 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18159 {
18160 rtx x = PATTERN (insn);
18161
18162 if (GET_CODE (x) == SET)
18163 {
18164 x = SET_SRC (x);
18165
18166 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18167 return priority + 10;
18168 }
18169
18170 return priority;
18171 }
18172
18173 /* Given OPERANDS of consecutive load/store, check if we can merge
18174 them into ldp/stp. LOAD is true if they are load instructions.
18175 MODE is the mode of memory operands. */
18176
18177 bool
18178 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18179 machine_mode mode)
18180 {
18181 HOST_WIDE_INT offval_1, offval_2, msize;
18182 enum reg_class rclass_1, rclass_2;
18183 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18184
18185 if (load)
18186 {
18187 mem_1 = operands[1];
18188 mem_2 = operands[3];
18189 reg_1 = operands[0];
18190 reg_2 = operands[2];
18191 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18192 if (REGNO (reg_1) == REGNO (reg_2))
18193 return false;
18194 }
18195 else
18196 {
18197 mem_1 = operands[0];
18198 mem_2 = operands[2];
18199 reg_1 = operands[1];
18200 reg_2 = operands[3];
18201 }
18202
18203 /* The mems cannot be volatile. */
18204 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18205 return false;
18206
18210 18207 /* If we have SImode and slow unaligned ldp,
18211 18208 check that the alignment is at least 8 bytes. */
18209 if (mode == SImode
18210 && (aarch64_tune_params.extra_tuning_flags
18211 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18212 && !optimize_size
18213 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18214 return false;
18215
18216 /* Check if the addresses are in the form of [base+offset]. */
18217 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18218 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18219 return false;
18220 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18221 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18222 return false;
18223
18227 18224 /* Check if the bases are the same. */
18225 if (!rtx_equal_p (base_1, base_2))
18226 return false;
18227
18228 /* The operands must be of the same size. */
18229 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18230 GET_MODE_SIZE (GET_MODE (mem_2))));
18231
18232 offval_1 = INTVAL (offset_1);
18233 offval_2 = INTVAL (offset_2);
18234 /* We should only be trying this for fixed-sized modes. There is no
18235 SVE LDP/STP instruction. */
18236 msize = GET_MODE_SIZE (mode).to_constant ();
18237 /* Check if the offsets are consecutive. */
18238 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18239 return false;
18240
18241 /* Check if the addresses are clobbered by load. */
18242 if (load)
18243 {
18244 if (reg_mentioned_p (reg_1, mem_1))
18245 return false;
18246
18247 /* In increasing order, the last load can clobber the address. */
18248 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18249 return false;
18250 }
18251
18252 /* One of the memory accesses must be a mempair operand.
18253 If it is not the first one, they need to be swapped by the
18254 peephole. */
18255 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18256 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18257 return false;
18258
18259 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18260 rclass_1 = FP_REGS;
18261 else
18262 rclass_1 = GENERAL_REGS;
18263
18264 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18265 rclass_2 = FP_REGS;
18266 else
18267 rclass_2 = GENERAL_REGS;
18268
18272 18269 /* Check if the registers are of the same class. */
18270 if (rclass_1 != rclass_2)
18271 return false;
18272
18273 return true;
18274 }
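/* For instance, a pair of DImode loads such as

     ldr  x0, [x3, #8]
     ldr  x1, [x3, #16]

   passes the checks above (same base, consecutive offsets of msize == 8,
   distinct destination registers of the same class) and is therefore a
   candidate for

     ldp  x0, x1, [x3, #8]

   (registers and offsets purely illustrative).  */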
18275
18276 /* Given OPERANDS of consecutive load/store that can be merged,
18277 swap them if they are not in ascending order. */
18278 void
18279 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18280 {
18281 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18282 HOST_WIDE_INT offval_1, offval_2;
18283
18284 if (load)
18285 {
18286 mem_1 = operands[1];
18287 mem_2 = operands[3];
18288 }
18289 else
18290 {
18291 mem_1 = operands[0];
18292 mem_2 = operands[2];
18293 }
18294
18295 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18296 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18297
18298 offval_1 = INTVAL (offset_1);
18299 offval_2 = INTVAL (offset_2);
18300
18301 if (offval_1 > offval_2)
18302 {
18303 /* Irrespective of whether this is a load or a store,
18304 we do the same swap. */
18305 std::swap (operands[0], operands[2]);
18306 std::swap (operands[1], operands[3]);
18307 }
18308 }
18309
18310 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18311 comparison between the two. */
18312 int
18313 aarch64_host_wide_int_compare (const void *x, const void *y)
18314 {
18315 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18316 * ((const HOST_WIDE_INT *) y));
18317 }
18318
18319 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18320 other pointing to a REG rtx containing an offset, compare the offsets
18321 of the two pairs.
18322
18323 Return:
18324
18325 1 iff offset (X) > offset (Y)
18326 0 iff offset (X) == offset (Y)
18327 -1 iff offset (X) < offset (Y) */
18328 int
18329 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18330 {
18331 const rtx * operands_1 = (const rtx *) x;
18332 const rtx * operands_2 = (const rtx *) y;
18333 rtx mem_1, mem_2, base, offset_1, offset_2;
18334
18335 if (MEM_P (operands_1[0]))
18336 mem_1 = operands_1[0];
18337 else
18338 mem_1 = operands_1[1];
18339
18340 if (MEM_P (operands_2[0]))
18341 mem_2 = operands_2[0];
18342 else
18343 mem_2 = operands_2[1];
18344
18345 /* Extract the offsets. */
18346 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18347 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18348
18349 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18350
18351 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18352 }
18353
18354 /* Given OPERANDS of consecutive load/store, check if we can merge
18355 them into ldp/stp by adjusting the offset. LOAD is true if they
18356 are load instructions. MODE is the mode of memory operands.
18357
18361 18358 Given the following consecutive stores:
18359
18360 str w1, [xb, 0x100]
18361 str w1, [xb, 0x104]
18362 str w1, [xb, 0x108]
18363 str w1, [xb, 0x10c]
18364
18365 Though the offsets are out of the range supported by stp, we can
18366 still pair them after adjusting the offset, like:
18367
18368 add scratch, xb, 0x100
18369 stp w1, w1, [scratch]
18370 stp w1, w1, [scratch, 0x8]
18371
18375 18372 The peephole patterns detecting this opportunity should guarantee
18376 18373 the scratch register is available. */
18374
18375 bool
18376 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18377 scalar_mode mode)
18378 {
18379 const int num_insns = 4;
18380 enum reg_class rclass;
18381 HOST_WIDE_INT offvals[num_insns], msize;
18382 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18383
18384 if (load)
18385 {
18386 for (int i = 0; i < num_insns; i++)
18387 {
18388 reg[i] = operands[2 * i];
18389 mem[i] = operands[2 * i + 1];
18390
18391 gcc_assert (REG_P (reg[i]));
18392 }
18393
18394 /* Do not attempt to merge the loads if the loads clobber each other. */
18395 for (int i = 0; i < 8; i += 2)
18396 for (int j = i + 2; j < 8; j += 2)
18397 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18398 return false;
18399 }
18400 else
18401 for (int i = 0; i < num_insns; i++)
18402 {
18403 mem[i] = operands[2 * i];
18404 reg[i] = operands[2 * i + 1];
18405 }
18406
18407 /* Skip if memory operand is by itself valid for ldp/stp. */
18408 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18409 return false;
18410
18411 for (int i = 0; i < num_insns; i++)
18412 {
18413 /* The mems cannot be volatile. */
18414 if (MEM_VOLATILE_P (mem[i]))
18415 return false;
18416
18417 /* Check if the addresses are in the form of [base+offset]. */
18418 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18419 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18420 return false;
18421 }
18422
18426 18423 /* Check if the registers are of the same class. */
18424 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18425 ? FP_REGS : GENERAL_REGS;
18426
18427 for (int i = 1; i < num_insns; i++)
18428 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18429 {
18430 if (rclass != FP_REGS)
18431 return false;
18432 }
18433 else
18434 {
18435 if (rclass != GENERAL_REGS)
18436 return false;
18437 }
18438
18439 /* Only the last register in the order in which they occur
18440 may be clobbered by the load. */
18441 if (rclass == GENERAL_REGS && load)
18442 for (int i = 0; i < num_insns - 1; i++)
18443 if (reg_mentioned_p (reg[i], mem[i]))
18444 return false;
18445
18449 18446 /* Check if the bases are the same. */
18447 for (int i = 0; i < num_insns - 1; i++)
18448 if (!rtx_equal_p (base[i], base[i + 1]))
18449 return false;
18450
18451 for (int i = 0; i < num_insns; i++)
18452 offvals[i] = INTVAL (offset[i]);
18453
18454 msize = GET_MODE_SIZE (mode);
18455
18456 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18457 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18458 aarch64_host_wide_int_compare);
18459
18460 if (!(offvals[1] == offvals[0] + msize
18461 && offvals[3] == offvals[2] + msize))
18462 return false;
18463
18464 /* Check that offsets are within range of each other. The ldp/stp
18465 instructions have 7 bit immediate offsets, so use 0x80. */
18466 if (offvals[2] - offvals[0] >= msize * 0x80)
18467 return false;
18468
18469 /* The offsets must be aligned with respect to each other. */
18470 if (offvals[0] % msize != offvals[2] % msize)
18471 return false;
18472
18476 18473 /* If we have SImode and slow unaligned ldp,
18477 18474 check that the alignment is at least 8 bytes. */
18475 if (mode == SImode
18476 && (aarch64_tune_params.extra_tuning_flags
18477 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18478 && !optimize_size
18479 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18480 return false;
18481
18482 return true;
18483 }
18484
18485 /* Given OPERANDS of consecutive load/store, this function pairs them
18486 into LDP/STP after adjusting the offset. It depends on the fact
18487 that the operands can be sorted so the offsets are correct for STP.
18491 18488 MODE is the mode of the memory operands. CODE is the rtl operator
18492 18489 that should be applied to all memory operands; it is SIGN_EXTEND,
18493 18490 ZERO_EXTEND or UNKNOWN. */
18491
18492 bool
18493 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18494 scalar_mode mode, RTX_CODE code)
18495 {
18496 rtx base, offset_1, offset_3, t1, t2;
18497 rtx mem_1, mem_2, mem_3, mem_4;
18498 rtx temp_operands[8];
18499 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18500 stp_off_upper_limit, stp_off_lower_limit, msize;
18501
18502 /* We make changes on a copy as we may still bail out. */
18503 for (int i = 0; i < 8; i ++)
18504 temp_operands[i] = operands[i];
18505
18506 /* Sort the operands. */
18507 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18508
18509 if (load)
18510 {
18511 mem_1 = temp_operands[1];
18512 mem_2 = temp_operands[3];
18513 mem_3 = temp_operands[5];
18514 mem_4 = temp_operands[7];
18515 }
18516 else
18517 {
18518 mem_1 = temp_operands[0];
18519 mem_2 = temp_operands[2];
18520 mem_3 = temp_operands[4];
18521 mem_4 = temp_operands[6];
18522 gcc_assert (code == UNKNOWN);
18523 }
18524
18525 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18526 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18527 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18528 && offset_3 != NULL_RTX);
18529
18530 /* Adjust offset so it can fit in LDP/STP instruction. */
18531 msize = GET_MODE_SIZE (mode);
18532 stp_off_upper_limit = msize * (0x40 - 1);
18533 stp_off_lower_limit = - msize * 0x40;
18534
18535 off_val_1 = INTVAL (offset_1);
18536 off_val_3 = INTVAL (offset_3);
18537
18538 /* The base offset is optimally half way between the two STP/LDP offsets. */
18539 if (msize <= 4)
18540 base_off = (off_val_1 + off_val_3) / 2;
18541 else
18545 18542 /* However, due to issues with negative LDP/STP offset generation for
18546 18543 larger modes (DF, DI and vector modes), we must not use negative
18547 18544 addresses smaller than 9 signed unadjusted bits can store. This
18548 18545 provides the most range in this case. */
18546 base_off = off_val_1;
18547
18548 /* Adjust the base so that it is aligned with the addresses but still
18549 optimal. */
18550 if (base_off % msize != off_val_1 % msize)
18551 /* Fix the offset, bearing in mind we want to make it bigger not
18552 smaller. */
18553 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18554 else if (msize <= 4)
18555 /* The negative range of LDP/STP is one larger than the positive range. */
18556 base_off += msize;
18557
18558 /* Check if base offset is too big or too small. We can attempt to resolve
18559 this issue by setting it to the maximum value and seeing if the offsets
18560 still fit. */
18561 if (base_off >= 0x1000)
18562 {
18563 base_off = 0x1000 - 1;
18567 18564 /* We must still make sure that the base offset is aligned with respect
18568 18565 to the address, but it may not be made any bigger. */
18566 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18567 }
18568
18569 /* Likewise for the case where the base is too small. */
18570 if (base_off <= -0x1000)
18571 {
18572 base_off = -0x1000 + 1;
18573 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18574 }
18575
18576 /* Offset of the first STP/LDP. */
18577 new_off_1 = off_val_1 - base_off;
18578
18579 /* Offset of the second STP/LDP. */
18580 new_off_3 = off_val_3 - base_off;
18581
18582 /* The offsets must be within the range of the LDP/STP instructions. */
18583 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18584 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18585 return false;
18586
18587 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18588 new_off_1), true);
18589 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18590 new_off_1 + msize), true);
18591 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18592 new_off_3), true);
18593 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18594 new_off_3 + msize), true);
18595
18596 if (!aarch64_mem_pair_operand (mem_1, mode)
18597 || !aarch64_mem_pair_operand (mem_3, mode))
18598 return false;
18599
18600 if (code == ZERO_EXTEND)
18601 {
18602 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18603 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18604 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18605 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18606 }
18607 else if (code == SIGN_EXTEND)
18608 {
18609 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18610 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18611 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18612 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18613 }
18614
18615 if (load)
18616 {
18617 operands[0] = temp_operands[0];
18618 operands[1] = mem_1;
18619 operands[2] = temp_operands[2];
18620 operands[3] = mem_2;
18621 operands[4] = temp_operands[4];
18622 operands[5] = mem_3;
18623 operands[6] = temp_operands[6];
18624 operands[7] = mem_4;
18625 }
18626 else
18627 {
18628 operands[0] = mem_1;
18629 operands[1] = temp_operands[1];
18630 operands[2] = mem_2;
18631 operands[3] = temp_operands[3];
18632 operands[4] = mem_3;
18633 operands[5] = temp_operands[5];
18634 operands[6] = mem_4;
18635 operands[7] = temp_operands[7];
18636 }
18637
18638 /* Emit adjusting instruction. */
18639 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18640 /* Emit ldp/stp instructions. */
18641 t1 = gen_rtx_SET (operands[0], operands[1]);
18642 t2 = gen_rtx_SET (operands[2], operands[3]);
18643 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18644 t1 = gen_rtx_SET (operands[4], operands[5]);
18645 t2 = gen_rtx_SET (operands[6], operands[7]);
18646 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18647 return true;
18648 }
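/* Tracing the arithmetic above for the earlier w1/xb example (SImode, so
   msize == 4, offsets 0x100 .. 0x10c): base_off starts at
   (0x100 + 0x108) / 2 == 0x104, is already aligned with off_val_1, and is
   therefore bumped by msize to 0x108; the two pair offsets then become
   -8 and 0, both comfortably within the SImode LDP/STP range of
   [-256, 252].  */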
18649
18650 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18651 it isn't worth branching around empty masked ops (including masked
18652 stores). */
18653
18654 static bool
18655 aarch64_empty_mask_is_expensive (unsigned)
18656 {
18657 return false;
18658 }
18659
18663 18660 /* Return true if a pseudo register should be created and used to hold
18664 18661 the GOT address for PIC code. */
18662
18663 bool
18664 aarch64_use_pseudo_pic_reg (void)
18665 {
18666 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18667 }
18668
18669 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18670
18671 static int
18672 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18673 {
18674 switch (XINT (x, 1))
18675 {
18676 case UNSPEC_GOTSMALLPIC:
18677 case UNSPEC_GOTSMALLPIC28K:
18678 case UNSPEC_GOTTINYPIC:
18679 return 0;
18680 default:
18681 break;
18682 }
18683
18684 return default_unspec_may_trap_p (x, flags);
18685 }
18686
18687
18688 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18689 return the log2 of that value. Otherwise return -1. */
18690
18691 int
18692 aarch64_fpconst_pow_of_2 (rtx x)
18693 {
18694 const REAL_VALUE_TYPE *r;
18695
18696 if (!CONST_DOUBLE_P (x))
18697 return -1;
18698
18699 r = CONST_DOUBLE_REAL_VALUE (x);
18700
18701 if (REAL_VALUE_NEGATIVE (*r)
18702 || REAL_VALUE_ISNAN (*r)
18703 || REAL_VALUE_ISINF (*r)
18704 || !real_isinteger (r, DFmode))
18705 return -1;
18706
18707 return exact_log2 (real_to_integer (r));
18708 }
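/* For example, a CONST_DOUBLE of 4.0 gives 2 and 8.0 gives 3, while 6.0
   (not a power of two), 0.75 (not an integer) and -8.0 (negative) all
   give -1.  */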
18709
18710 /* If X is a vector of equal CONST_DOUBLE values and that value is
18711 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18712
18713 int
18714 aarch64_vec_fpconst_pow_of_2 (rtx x)
18715 {
18716 int nelts;
18717 if (GET_CODE (x) != CONST_VECTOR
18718 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18719 return -1;
18720
18721 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18722 return -1;
18723
18724 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18725 if (firstval <= 0)
18726 return -1;
18727
18728 for (int i = 1; i < nelts; i++)
18729 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18730 return -1;
18731
18732 return firstval;
18733 }
18734
18735 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18736 to float.
18737
18738 __fp16 always promotes through this hook.
18739 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18740 through the generic excess precision logic rather than here. */
18741
18742 static tree
18743 aarch64_promoted_type (const_tree t)
18744 {
18745 if (SCALAR_FLOAT_TYPE_P (t)
18746 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18747 return float_type_node;
18748
18749 return NULL_TREE;
18750 }
18751
18752 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18753
18754 static bool
18755 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18756 optimization_type opt_type)
18757 {
18758 switch (op)
18759 {
18760 case rsqrt_optab:
18761 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18762
18763 default:
18764 return true;
18765 }
18766 }
18767
18768 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18769
18770 static unsigned int
18771 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18772 int *offset)
18773 {
18774 /* Polynomial invariant 1 == (VG / 2) - 1. */
18775 gcc_assert (i == 1);
18776 *factor = 2;
18777 *offset = 1;
18778 return AARCH64_DWARF_VG;
18779 }
18780
18784 18781 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18785 18782 if MODE is HFmode, and punt to the generic implementation otherwise. */
18783
18784 static bool
18785 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18786 {
18787 return (mode == HFmode
18788 ? true
18789 : default_libgcc_floating_mode_supported_p (mode));
18790 }
18791
18792 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18793 if MODE is HFmode, and punt to the generic implementation otherwise. */
18794
18795 static bool
18796 aarch64_scalar_mode_supported_p (scalar_mode mode)
18797 {
18798 return (mode == HFmode
18799 ? true
18800 : default_scalar_mode_supported_p (mode));
18801 }
18802
18803 /* Set the value of FLT_EVAL_METHOD.
18804 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18805
18806 0: evaluate all operations and constants, whose semantic type has at
18807 most the range and precision of type float, to the range and
18808 precision of float; evaluate all other operations and constants to
18809 the range and precision of the semantic type;
18810
18811 N, where _FloatN is a supported interchange floating type
18812 evaluate all operations and constants, whose semantic type has at
18813 most the range and precision of _FloatN type, to the range and
18814 precision of the _FloatN type; evaluate all other operations and
18815 constants to the range and precision of the semantic type;
18816
18817 If we have the ARMv8.2-A extensions then we support _Float16 in native
18818 precision, so we should set this to 16. Otherwise, we support the type,
18819 but want to evaluate expressions in float precision, so set this to
18820 0. */
18821
18822 static enum flt_eval_method
18823 aarch64_excess_precision (enum excess_precision_type type)
18824 {
18825 switch (type)
18826 {
18827 case EXCESS_PRECISION_TYPE_FAST:
18828 case EXCESS_PRECISION_TYPE_STANDARD:
18829 /* We can calculate either in 16-bit range and precision or
18830 32-bit range and precision. Make that decision based on whether
18831 we have native support for the ARMv8.2-A 16-bit floating-point
18832 instructions or not. */
18833 return (TARGET_FP_F16INST
18834 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18835 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18836 case EXCESS_PRECISION_TYPE_IMPLICIT:
18837 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18838 default:
18839 gcc_unreachable ();
18840 }
18841 return FLT_EVAL_METHOD_UNPREDICTABLE;
18842 }
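/* Concretely, for a hypothetical source fragment

     _Float16 a, b, c;
     c = a * b + c;

   the intermediate operations are evaluated in _Float16 when
   TARGET_FP_F16INST is available (FLT_EVAL_METHOD == 16) and in float
   otherwise (FLT_EVAL_METHOD == 0).  */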
18843
18844 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18845 scheduled for speculative execution. Reject the long-running division
18846 and square-root instructions. */
18847
18848 static bool
18849 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18850 {
18851 switch (get_attr_type (insn))
18852 {
18853 case TYPE_SDIV:
18854 case TYPE_UDIV:
18855 case TYPE_FDIVS:
18856 case TYPE_FDIVD:
18857 case TYPE_FSQRTS:
18858 case TYPE_FSQRTD:
18859 case TYPE_NEON_FP_SQRT_S:
18860 case TYPE_NEON_FP_SQRT_D:
18861 case TYPE_NEON_FP_SQRT_S_Q:
18862 case TYPE_NEON_FP_SQRT_D_Q:
18863 case TYPE_NEON_FP_DIV_S:
18864 case TYPE_NEON_FP_DIV_D:
18865 case TYPE_NEON_FP_DIV_S_Q:
18866 case TYPE_NEON_FP_DIV_D_Q:
18867 return false;
18868 default:
18869 return true;
18870 }
18871 }
18872
18873 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18874
18875 static int
18876 aarch64_compute_pressure_classes (reg_class *classes)
18877 {
18878 int i = 0;
18879 classes[i++] = GENERAL_REGS;
18880 classes[i++] = FP_REGS;
18881 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18882 registers need to go in PR_LO_REGS at some point during their
18883 lifetime. Splitting it into two halves has the effect of making
18884 all predicates count against PR_LO_REGS, so that we try whenever
18885 possible to restrict the number of live predicates to 8. This
18886 greatly reduces the amount of spilling in certain loops. */
18887 classes[i++] = PR_LO_REGS;
18888 classes[i++] = PR_HI_REGS;
18889 return i;
18890 }
18891
18892 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18893
18894 static bool
18895 aarch64_can_change_mode_class (machine_mode from,
18896 machine_mode to, reg_class_t)
18897 {
18898 if (BYTES_BIG_ENDIAN)
18899 {
18900 bool from_sve_p = aarch64_sve_data_mode_p (from);
18901 bool to_sve_p = aarch64_sve_data_mode_p (to);
18902
18903 /* Don't allow changes between SVE data modes and non-SVE modes.
18904 See the comment at the head of aarch64-sve.md for details. */
18905 if (from_sve_p != to_sve_p)
18906 return false;
18907
18908 /* Don't allow changes in element size: lane 0 of the new vector
18909 would not then be lane 0 of the old vector. See the comment
18910 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18911 description.
18912
18913 In the worst case, this forces a register to be spilled in
18914 one mode and reloaded in the other, which handles the
18915 endianness correctly. */
18916 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18917 return false;
18918 }
18919 return true;
18920 }
18921
18922 /* Implement TARGET_EARLY_REMAT_MODES. */
18923
18924 static void
18925 aarch64_select_early_remat_modes (sbitmap modes)
18926 {
18927 /* SVE values are not normally live across a call, so it should be
18928 worth doing early rematerialization even in VL-specific mode. */
18929 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18930 {
18931 machine_mode mode = (machine_mode) i;
18932 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18933 if (vec_flags & VEC_ANY_SVE)
18934 bitmap_set_bit (modes, i);
18935 }
18936 }
18937
18938 /* Override the default target speculation_safe_value. */
18939 static rtx
18940 aarch64_speculation_safe_value (machine_mode mode,
18941 rtx result, rtx val, rtx failval)
18942 {
18943 /* Maybe we should warn if falling back to hard barriers. They are
18947 18944 likely to be noticeably more expensive than the alternative below. */
18945 if (!aarch64_track_speculation)
18946 return default_speculation_safe_value (mode, result, val, failval);
18947
18948 if (!REG_P (val))
18949 val = copy_to_mode_reg (mode, val);
18950
18951 if (!aarch64_reg_or_zero (failval, mode))
18952 failval = copy_to_mode_reg (mode, failval);
18953
18954 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18955 return result;
18956 }
18957
18958 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18959 Look into the tuning structure for an estimate.
18960 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18961 Advanced SIMD 128 bits. */
18962
18963 static HOST_WIDE_INT
18964 aarch64_estimated_poly_value (poly_int64 val)
18965 {
18966 enum aarch64_sve_vector_bits_enum width_source
18967 = aarch64_tune_params.sve_width;
18968
18969 /* If we still don't have an estimate, use the default. */
18970 if (width_source == SVE_SCALABLE)
18971 return default_estimated_poly_value (val);
18972
18973 HOST_WIDE_INT over_128 = width_source - 128;
18974 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18975 }
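/* For example, assuming VAL is the number of bytes in an SVE data vector
   (16 + 16x in poly_int terms -- an assumption made purely for illustration)
   and the tuning structure reports sve_width == 256, the estimate is
   16 + 16 * (256 - 128) / 128 == 32 bytes.  */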
18976
18977
18978 /* Return true for types that could be supported as SIMD return or
18979 argument types. */
18980
18981 static bool
18982 supported_simd_type (tree t)
18983 {
18984 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18985 {
18986 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18987 return s == 1 || s == 2 || s == 4 || s == 8;
18988 }
18989 return false;
18990 }
18991
18992 /* Return true for types that currently are supported as SIMD return
18993 or argument types. */
18994
18995 static bool
18996 currently_supported_simd_type (tree t, tree b)
18997 {
18998 if (COMPLEX_FLOAT_TYPE_P (t))
18999 return false;
19000
19001 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19002 return false;
19003
19004 return supported_simd_type (t);
19005 }
19006
19007 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19008
19009 static int
19010 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19011 struct cgraph_simd_clone *clonei,
19012 tree base_type, int num)
19013 {
19014 tree t, ret_type, arg_type;
19015 unsigned int elt_bits, vec_bits, count;
19016
19017 if (!TARGET_SIMD)
19018 return 0;
19019
19020 if (clonei->simdlen
19021 && (clonei->simdlen < 2
19022 || clonei->simdlen > 1024
19023 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19024 {
19025 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19026 "unsupported simdlen %d", clonei->simdlen);
19027 return 0;
19028 }
19029
19030 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19031 if (TREE_CODE (ret_type) != VOID_TYPE
19032 && !currently_supported_simd_type (ret_type, base_type))
19033 {
19034 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19035 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19036 "GCC does not currently support mixed size types "
19037 "for %<simd%> functions");
19038 else if (supported_simd_type (ret_type))
19039 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19040 "GCC does not currently support return type %qT "
19041 "for %<simd%> functions", ret_type);
19042 else
19043 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19044 "unsupported return type %qT for %<simd%> functions",
19045 ret_type);
19046 return 0;
19047 }
19048
19049 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19050 {
19051 arg_type = TREE_TYPE (t);
19052
19053 if (!currently_supported_simd_type (arg_type, base_type))
19054 {
19055 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19056 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19057 "GCC does not currently support mixed size types "
19058 "for %<simd%> functions");
19059 else
19060 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19061 "GCC does not currently support argument type %qT "
19062 "for %<simd%> functions", arg_type);
19063 return 0;
19064 }
19065 }
19066
19067 clonei->vecsize_mangle = 'n';
19068 clonei->mask_mode = VOIDmode;
19069 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19070 if (clonei->simdlen == 0)
19071 {
19072 count = 2;
19073 vec_bits = (num == 0 ? 64 : 128);
19074 clonei->simdlen = vec_bits / elt_bits;
19075 }
19076 else
19077 {
19078 count = 1;
19079 vec_bits = clonei->simdlen * elt_bits;
19080 if (vec_bits != 64 && vec_bits != 128)
19081 {
19082 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19083 "GCC does not currently support simdlen %d for type %qT",
19084 clonei->simdlen, base_type);
19085 return 0;
19086 }
19087 }
19088 clonei->vecsize_int = vec_bits;
19089 clonei->vecsize_float = vec_bits;
19090 return count;
19091 }
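/* For instance, for a "simd" function whose base type is float (32-bit
   elements) and which has no explicit simdlen, elt_bits == 32 and two
   clones are produced: a 64-bit one with simdlen 2 and a 128-bit one with
   simdlen 4, both using the Advanced SIMD 'n' mangling.  */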
19092
19093 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19094
19095 static void
19096 aarch64_simd_clone_adjust (struct cgraph_node *node)
19097 {
19098 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19099 use the correct ABI. */
19100
19101 tree t = TREE_TYPE (node->decl);
19102 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19103 TYPE_ATTRIBUTES (t));
19104 }
19105
19106 /* Implement TARGET_SIMD_CLONE_USABLE. */
19107
19108 static int
19109 aarch64_simd_clone_usable (struct cgraph_node *node)
19110 {
19111 switch (node->simdclone->vecsize_mangle)
19112 {
19113 case 'n':
19114 if (!TARGET_SIMD)
19115 return -1;
19116 return 0;
19117 default:
19118 gcc_unreachable ();
19119 }
19120 }
19121
19122 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19123
19124 static int
19125 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19126 {
19127 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19128 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19129 return 0;
19130 return 1;
19131 }
19132
19133 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19134
19135 static const char *
19136 aarch64_get_multilib_abi_name (void)
19137 {
19138 if (TARGET_BIG_END)
19139 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19140 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19141 }
19142
19146 19143 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
19147 19144 global-variable-based guard, use the default; otherwise
19148 19145 return a null tree. */
19146 static tree
19147 aarch64_stack_protect_guard (void)
19148 {
19149 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19150 return default_stack_protect_guard ();
19151
19152 return NULL_TREE;
19153 }
19154
19155 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19156 section at the end if needed. */
19157 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19158 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19159 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19160 void
19161 aarch64_file_end_indicate_exec_stack ()
19162 {
19163 file_end_indicate_exec_stack ();
19164
19165 unsigned feature_1_and = 0;
19166 if (aarch64_bti_enabled ())
19167 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19168
19169 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19170 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19171
19172 if (feature_1_and)
19173 {
19174 /* Generate .note.gnu.property section. */
19175 switch_to_section (get_section (".note.gnu.property",
19176 SECTION_NOTYPE, NULL));
19177
19178 /* PT_NOTE header: namesz, descsz, type.
19179 namesz = 4 ("GNU\0")
19180 descsz = 16 (Size of the program property array)
19181 [(12 + padding) * Number of array elements]
19182 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19183 assemble_align (POINTER_SIZE);
19184 assemble_integer (GEN_INT (4), 4, 32, 1);
19185 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19186 assemble_integer (GEN_INT (5), 4, 32, 1);
19187
19188 /* PT_NOTE name. */
19189 assemble_string ("GNU", 4);
19190
19191 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19192 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19193 datasz = 4
19194 data = feature_1_and. */
19195 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19196 assemble_integer (GEN_INT (4), 4, 32, 1);
19197 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19198
19199 /* Pad the size of the note to the required alignment. */
19200 assemble_align (POINTER_SIZE);
19201 }
19202 }
19203 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19204 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19205 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
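/* As a concrete illustration (hand-assembled here, assuming LP64 with both
   BTI and pointer authentication enabled, so feature_1_and == 3), the note
   emitted above amounts to:

     .word  4            namesz ("GNU\0")
     .word  16           descsz (12 rounded up to POINTER_BYTES == 8)
     .word  5            type (NT_GNU_PROPERTY_TYPE_0)
     .asciz "GNU"
     .word  0xc0000000   GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word  4            datasz
     .word  3            GNU_PROPERTY_AARCH64_FEATURE_1_BTI | ..._PAC

   padded to an 8-byte boundary.  Exact assembler syntax will vary.  */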
19206
19207 /* Target-specific selftests. */
19208
19209 #if CHECKING_P
19210
19211 namespace selftest {
19212
19213 /* Selftest for the RTL loader.
19214 Verify that the RTL loader copes with a dump from
19215 print_rtx_function. This is essentially just a test that class
19216 function_reader can handle a real dump, but it also verifies
19217 that lookup_reg_by_dump_name correctly handles hard regs.
19218 The presence of hard reg names in the dump means that the test is
19219 target-specific, hence it is in this file. */
19220
19221 static void
19222 aarch64_test_loading_full_dump ()
19223 {
19224 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19225
19226 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19227
19228 rtx_insn *insn_1 = get_insn_by_uid (1);
19229 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19230
19231 rtx_insn *insn_15 = get_insn_by_uid (15);
19232 ASSERT_EQ (INSN, GET_CODE (insn_15));
19233 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19234
19235 /* Verify crtl->return_rtx. */
19236 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19237 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19238 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19239 }
19240
19241 /* Run all target-specific selftests. */
19242
19243 static void
19244 aarch64_run_selftests (void)
19245 {
19246 aarch64_test_loading_full_dump ();
19247 }
19248
19249 } // namespace selftest
19250
19251 #endif /* #if CHECKING_P */
19252
19253 #undef TARGET_STACK_PROTECT_GUARD
19254 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19255
19256 #undef TARGET_ADDRESS_COST
19257 #define TARGET_ADDRESS_COST aarch64_address_cost
19258
19262 19259 /* This hook determines whether unnamed bitfields affect the alignment
19260 of the containing structure. The hook returns true if the structure
19261 should inherit the alignment requirements of an unnamed bitfield's
19262 type. */
19263 #undef TARGET_ALIGN_ANON_BITFIELD
19264 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19265
19266 #undef TARGET_ASM_ALIGNED_DI_OP
19267 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19268
19269 #undef TARGET_ASM_ALIGNED_HI_OP
19270 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19271
19272 #undef TARGET_ASM_ALIGNED_SI_OP
19273 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19274
19275 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19276 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19277 hook_bool_const_tree_hwi_hwi_const_tree_true
19278
19279 #undef TARGET_ASM_FILE_START
19280 #define TARGET_ASM_FILE_START aarch64_start_file
19281
19282 #undef TARGET_ASM_OUTPUT_MI_THUNK
19283 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19284
19285 #undef TARGET_ASM_SELECT_RTX_SECTION
19286 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19287
19288 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19289 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19290
19291 #undef TARGET_BUILD_BUILTIN_VA_LIST
19292 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19293
19294 #undef TARGET_CALLEE_COPIES
19295 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19296
19297 #undef TARGET_CAN_ELIMINATE
19298 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19299
19300 #undef TARGET_CAN_INLINE_P
19301 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19302
19303 #undef TARGET_CANNOT_FORCE_CONST_MEM
19304 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19305
19306 #undef TARGET_CASE_VALUES_THRESHOLD
19307 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19308
19309 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19310 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19311
19312 /* Only the least significant bit is used for initialization guard
19313 variables. */
19314 #undef TARGET_CXX_GUARD_MASK_BIT
19315 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19316
19317 #undef TARGET_C_MODE_FOR_SUFFIX
19318 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19319
19320 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19321 #undef TARGET_DEFAULT_TARGET_FLAGS
19322 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19323 #endif
19324
19325 #undef TARGET_CLASS_MAX_NREGS
19326 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19327
19328 #undef TARGET_BUILTIN_DECL
19329 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19330
19331 #undef TARGET_BUILTIN_RECIPROCAL
19332 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19333
19334 #undef TARGET_C_EXCESS_PRECISION
19335 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19336
19337 #undef TARGET_EXPAND_BUILTIN
19338 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19339
19340 #undef TARGET_EXPAND_BUILTIN_VA_START
19341 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19342
19343 #undef TARGET_FOLD_BUILTIN
19344 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19345
19346 #undef TARGET_FUNCTION_ARG
19347 #define TARGET_FUNCTION_ARG aarch64_function_arg
19348
19349 #undef TARGET_FUNCTION_ARG_ADVANCE
19350 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19351
19352 #undef TARGET_FUNCTION_ARG_BOUNDARY
19353 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19354
19355 #undef TARGET_FUNCTION_ARG_PADDING
19356 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19357
19358 #undef TARGET_GET_RAW_RESULT_MODE
19359 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19360 #undef TARGET_GET_RAW_ARG_MODE
19361 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19362
19363 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19364 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19365
19366 #undef TARGET_FUNCTION_VALUE
19367 #define TARGET_FUNCTION_VALUE aarch64_function_value
19368
19369 #undef TARGET_FUNCTION_VALUE_REGNO_P
19370 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19371
19372 #undef TARGET_GIMPLE_FOLD_BUILTIN
19373 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19374
19375 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19376 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19377
19378 #undef TARGET_INIT_BUILTINS
19379 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19380
19381 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19382 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19383 aarch64_ira_change_pseudo_allocno_class
19384
19385 #undef TARGET_LEGITIMATE_ADDRESS_P
19386 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19387
19388 #undef TARGET_LEGITIMATE_CONSTANT_P
19389 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19390
19391 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19392 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19393 aarch64_legitimize_address_displacement
19394
19395 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19396 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19397
19398 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19399 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19400 aarch64_libgcc_floating_mode_supported_p
19401
19402 #undef TARGET_MANGLE_TYPE
19403 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19404
19405 #undef TARGET_MEMORY_MOVE_COST
19406 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19407
19408 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19409 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19410
19411 #undef TARGET_MUST_PASS_IN_STACK
19412 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19413
19414 /* This target hook should return true if accesses to volatile bitfields
19415 should use the narrowest mode possible. It should return false if these
19416 accesses should use the bitfield container type. */
19417 #undef TARGET_NARROW_VOLATILE_BITFIELD
19418 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19419
19420 #undef TARGET_OPTION_OVERRIDE
19421 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19422
19423 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19424 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19425 aarch64_override_options_after_change
19426
19427 #undef TARGET_OPTION_SAVE
19428 #define TARGET_OPTION_SAVE aarch64_option_save
19429
19430 #undef TARGET_OPTION_RESTORE
19431 #define TARGET_OPTION_RESTORE aarch64_option_restore
19432
19433 #undef TARGET_OPTION_PRINT
19434 #define TARGET_OPTION_PRINT aarch64_option_print
19435
19436 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19437 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19438
19439 #undef TARGET_SET_CURRENT_FUNCTION
19440 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19441
19442 #undef TARGET_PASS_BY_REFERENCE
19443 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19444
19445 #undef TARGET_PREFERRED_RELOAD_CLASS
19446 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19447
19448 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19449 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19450
19451 #undef TARGET_PROMOTED_TYPE
19452 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19453
19454 #undef TARGET_SECONDARY_RELOAD
19455 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19456
19457 #undef TARGET_SHIFT_TRUNCATION_MASK
19458 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19459
19460 #undef TARGET_SETUP_INCOMING_VARARGS
19461 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19462
19463 #undef TARGET_STRUCT_VALUE_RTX
19464 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19465
19466 #undef TARGET_REGISTER_MOVE_COST
19467 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19468
19469 #undef TARGET_RETURN_IN_MEMORY
19470 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19471
19472 #undef TARGET_RETURN_IN_MSB
19473 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19474
19475 #undef TARGET_RTX_COSTS
19476 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19477
19478 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19479 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19480
19481 #undef TARGET_SCHED_ISSUE_RATE
19482 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19483
19484 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19485 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19486 aarch64_sched_first_cycle_multipass_dfa_lookahead
19487
19488 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19489 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19490 aarch64_first_cycle_multipass_dfa_lookahead_guard
19491
19492 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19493 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19494 aarch64_get_separate_components
19495
19496 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19497 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19498 aarch64_components_for_bb
19499
19500 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19501 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19502 aarch64_disqualify_components
19503
19504 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19505 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19506 aarch64_emit_prologue_components
19507
19508 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19509 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19510 aarch64_emit_epilogue_components
19511
19512 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19513 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19514 aarch64_set_handled_components
19515
19516 #undef TARGET_TRAMPOLINE_INIT
19517 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19518
19519 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19520 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19521
19522 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19523 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19524
19525 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19526 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19527 aarch64_builtin_support_vector_misalignment
19528
19529 #undef TARGET_ARRAY_MODE
19530 #define TARGET_ARRAY_MODE aarch64_array_mode
19531
19532 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19533 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19534
19535 #undef TARGET_VECTORIZE_ADD_STMT_COST
19536 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19537
19538 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19539 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19540 aarch64_builtin_vectorization_cost
19541
19542 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19543 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19544
19545 #undef TARGET_VECTORIZE_BUILTINS
19546 #define TARGET_VECTORIZE_BUILTINS
19547
19548 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19549 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19550 aarch64_builtin_vectorized_function
19551
19552 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19553 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19554 aarch64_autovectorize_vector_sizes
19555
19556 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19557 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19558 aarch64_atomic_assign_expand_fenv
19559
19560 /* Section anchor support. */
19561
19562 #undef TARGET_MIN_ANCHOR_OFFSET
19563 #define TARGET_MIN_ANCHOR_OFFSET -256
19564
19565 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19566 byte offset; we can do much more for larger data types, but have no way
19567 to determine the size of the access. We assume accesses are aligned. */
19568 #undef TARGET_MAX_ANCHOR_OFFSET
19569 #define TARGET_MAX_ANCHOR_OFFSET 4095
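/* Roughly, the two limits above track the AArch64 addressing modes an
   anchored access can use, e.g.:
     ldur w0, [x1, #-256]    // 9-bit signed unscaled offset
     ldrb w0, [x1, #4095]    // 12-bit unsigned offset, scaled by the
                             // access size (1 byte here)
   so an object whose offset from its section anchor falls in
   [-256, 4095] can be addressed directly off the anchor register.  */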
19570
19571 #undef TARGET_VECTOR_ALIGNMENT
19572 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19573
19574 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19575 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19576 aarch64_vectorize_preferred_vector_alignment
19577 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19578 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19579 aarch64_simd_vector_alignment_reachable
19580
19581 /* vec_perm support. */
19582
19583 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19584 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19585 aarch64_vectorize_vec_perm_const
19586
19587 #undef TARGET_VECTORIZE_GET_MASK_MODE
19588 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19589 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19590 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19591 aarch64_empty_mask_is_expensive
19592 #undef TARGET_PREFERRED_ELSE_VALUE
19593 #define TARGET_PREFERRED_ELSE_VALUE \
19594 aarch64_preferred_else_value
19595
19596 #undef TARGET_INIT_LIBFUNCS
19597 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19598
19599 #undef TARGET_FIXED_CONDITION_CODE_REGS
19600 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19601
19602 #undef TARGET_FLAGS_REGNUM
19603 #define TARGET_FLAGS_REGNUM CC_REGNUM
19604
19605 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19606 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19607
19608 #undef TARGET_ASAN_SHADOW_OFFSET
19609 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19610
19611 #undef TARGET_LEGITIMIZE_ADDRESS
19612 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19613
19614 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19615 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19616
19617 #undef TARGET_CAN_USE_DOLOOP_P
19618 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19619
19620 #undef TARGET_SCHED_ADJUST_PRIORITY
19621 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19622
19623 #undef TARGET_SCHED_MACRO_FUSION_P
19624 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19625
19626 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19627 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19628
19629 #undef TARGET_SCHED_FUSION_PRIORITY
19630 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19631
19632 #undef TARGET_UNSPEC_MAY_TRAP_P
19633 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19634
19635 #undef TARGET_USE_PSEUDO_PIC_REG
19636 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19637
19638 #undef TARGET_PRINT_OPERAND
19639 #define TARGET_PRINT_OPERAND aarch64_print_operand
19640
19641 #undef TARGET_PRINT_OPERAND_ADDRESS
19642 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19643
19644 #undef TARGET_OPTAB_SUPPORTED_P
19645 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19646
19647 #undef TARGET_OMIT_STRUCT_RETURN_REG
19648 #define TARGET_OMIT_STRUCT_RETURN_REG true
19649
19650 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19651 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19652 aarch64_dwarf_poly_indeterminate_value
19653
19654 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19655 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19656 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
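/* 4 == 1 << 2, so descriptor addresses can be told apart from ordinary
   code addresses at run time by testing bit 2.  */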
19657
19658 #undef TARGET_HARD_REGNO_NREGS
19659 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19660 #undef TARGET_HARD_REGNO_MODE_OK
19661 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19662
19663 #undef TARGET_MODES_TIEABLE_P
19664 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19665
19666 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19667 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19668 aarch64_hard_regno_call_part_clobbered
19669
19670 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19671 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19672 aarch64_remove_extra_call_preserved_regs
19673
19674 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19675 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19676 aarch64_return_call_with_max_clobbers
19677
19678 #undef TARGET_CONSTANT_ALIGNMENT
19679 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19680
19681 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19682 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19683 aarch64_stack_clash_protection_alloca_probe_range
19684
19685 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19686 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19687
19688 #undef TARGET_CAN_CHANGE_MODE_CLASS
19689 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19690
19691 #undef TARGET_SELECT_EARLY_REMAT_MODES
19692 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19693
19694 #undef TARGET_SPECULATION_SAFE_VALUE
19695 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19696
19697 #undef TARGET_ESTIMATED_POLY_VALUE
19698 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19699
19700 #undef TARGET_ATTRIBUTE_TABLE
19701 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19702
19703 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19704 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19705 aarch64_simd_clone_compute_vecsize_and_simdlen
19706
19707 #undef TARGET_SIMD_CLONE_ADJUST
19708 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19709
19710 #undef TARGET_SIMD_CLONE_USABLE
19711 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19712
19713 #undef TARGET_COMP_TYPE_ATTRIBUTES
19714 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
19715
19716 #undef TARGET_GET_MULTILIB_ABI_NAME
19717 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
19718
19719 #if CHECKING_P
19720 #undef TARGET_RUN_TARGET_SELFTESTS
19721 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19722 #endif /* #if CHECKING_P */
19723
19724 #undef TARGET_ASM_POST_CFI_STARTPROC
19725 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
19726
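/* Instantiate the target hook vector.  TARGET_INITIALIZER (provided via
   target-def.h) expands to an initializer that uses each TARGET_* macro
   defined above and the documented default for every hook not
   overridden here.  */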
19727 struct gcc_target targetm = TARGET_INITIALIZER;
19728
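/* Include the gengtype-generated garbage-collector tables for the
   GTY(())-marked state in this file.  */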
19729 #include "gt-aarch64.h"