1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
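/* Illustrative uses (not part of the upstream source): a vector whose
   QImode elements all equal 0x2a would be described by
   simd_immediate_info (QImode, 0x2a), i.e. value 0x2a, no step, a MOV
   insn and an (LSL, 0) modifier, while an SVE series constant such as
   { 1, 3, 5, ... } would use the (scalar_mode, rtx, rtx) constructor
   with a first value of 1 and a step of 2.  */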
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 unsigned long aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
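/* For illustration (the exact strings live in the .def files): an entry
   such as AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) in
   aarch64-fusion-pairs.def expands via the macro above into
   { "mov+movk", AARCH64_FUSE_MOV_MOVK }, so each table maps the
   user-visible option names onto internal flags, bracketed by the
   explicit "none"/"all" entries and a NULL terminator.  */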
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Generic costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* Generic costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* Costs for vector insn classes for Vulcan. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params ares_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const unsigned long flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 /* The current tuning set. */
1176 struct tune_params aarch64_tune_params = generic_tunings;
1177
1178 /* Table of machine attributes. */
1179 static const struct attribute_spec aarch64_attribute_table[] =
1180 {
1181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1182 affects_type_identity, handler, exclude } */
1183 { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL },
1184 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1185 };
1186
1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1188
1189 /* An ISA extension in the co-processor and main instruction set space. */
1190 struct aarch64_option_extension
1191 {
1192 const char *const name;
1193 const unsigned long flags_on;
1194 const unsigned long flags_off;
1195 };
1196
1197 typedef enum aarch64_cond_code
1198 {
1199 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1200 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1201 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1202 }
1203 aarch64_cc;
1204
1205 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
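/* Worked example: the condition codes above are laid out so that each
   code and its inverse differ only in bit 0, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT (10 ^ 1)
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE.  */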
1206
1207 struct aarch64_branch_protect_type
1208 {
1209 /* The type's name that the user passes to the branch-protection option
1210 string. */
1211 const char* name;
1212 /* Function to handle the protection type and set global variables.
1213 First argument is the string token corresponding with this type and the
1214 second argument is the next token in the option string.
1215 Return values:
1216 * AARCH64_PARSE_OK: Handling was successful.
1217 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1218 should print an error.
1219 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1220 own error. */
1221 enum aarch64_parse_opt_result (*handler)(char*, char*);
1222 /* A list of types that can follow this type in the option string. */
1223 const aarch64_branch_protect_type* subtypes;
1224 unsigned int num_subtypes;
1225 };
1226
1227 static enum aarch64_parse_opt_result
1228 aarch64_handle_no_branch_protection (char* str, char* rest)
1229 {
1230 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1231 aarch64_enable_bti = 0;
1232 if (rest)
1233 {
1234 error ("unexpected %<%s%> after %<%s%>", rest, str);
1235 return AARCH64_PARSE_INVALID_FEATURE;
1236 }
1237 return AARCH64_PARSE_OK;
1238 }
1239
1240 static enum aarch64_parse_opt_result
1241 aarch64_handle_standard_branch_protection (char* str, char* rest)
1242 {
1243 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1244 aarch64_enable_bti = 1;
1245 if (rest)
1246 {
1247 error ("unexpected %<%s%> after %<%s%>", rest, str);
1248 return AARCH64_PARSE_INVALID_FEATURE;
1249 }
1250 return AARCH64_PARSE_OK;
1251 }
1252
1253 static enum aarch64_parse_opt_result
1254 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1255 char* rest ATTRIBUTE_UNUSED)
1256 {
1257 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1258 return AARCH64_PARSE_OK;
1259 }
1260
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1263 char* rest ATTRIBUTE_UNUSED)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1266 return AARCH64_PARSE_OK;
1267 }
1268
1269 static enum aarch64_parse_opt_result
1270 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1271 char* rest ATTRIBUTE_UNUSED)
1272 {
1273 aarch64_enable_bti = 1;
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1278 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1279 { NULL, NULL, NULL, 0 }
1280 };
1281
1282 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1283 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1284 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1285 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1286 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1287 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1288 { NULL, NULL, NULL, 0 }
1289 };
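/* Illustrative walk-through (not part of the upstream source): for
   -mbranch-protection=pac-ret+leaf the option parser matches "pac-ret"
   in aarch64_branch_protect_types, whose handler sets
   aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF, and then matches
   the "leaf" subtype in aarch64_pac_ret_subtypes, which widens the
   scope to AARCH64_FUNCTION_ALL.  */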
1290
1291 /* The condition codes of the processor, and the inverse function. */
1292 static const char * const aarch64_condition_codes[] =
1293 {
1294 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1295 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1296 };
1297
1298 /* Generate code to enable conditional branches in functions over 1 MiB. */
1299 const char *
1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1301 const char * branch_format)
1302 {
1303 rtx_code_label * tmp_label = gen_label_rtx ();
1304 char label_buf[256];
1305 char buffer[128];
1306 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1307 CODE_LABEL_NUMBER (tmp_label));
1308 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1309 rtx dest_label = operands[pos_label];
1310 operands[pos_label] = tmp_label;
1311
1312 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1313 output_asm_insn (buffer, operands);
1314
1315 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1316 operands[pos_label] = dest_label;
1317 output_asm_insn (buffer, operands);
1318 return "";
1319 }
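/* For illustration (register names and label numbers are invented): if
   the caller passes an already-inverted conditional branch as
   BRANCH_FORMAT, the code above emits a sequence of the shape

	b.ne	.Lbcond4		// inverted condition, short range
	b	.Lfar_target		// unconditional, +/-128 MiB range
   .Lbcond4:

   so the original condition can still reach a destination beyond the
   +/-1 MiB range of a single conditional branch.  */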
1320
1321 void
1322 aarch64_err_no_fpadvsimd (machine_mode mode)
1323 {
1324 if (TARGET_GENERAL_REGS_ONLY)
1325 if (FLOAT_MODE_P (mode))
1326 error ("%qs is incompatible with the use of floating-point types",
1327 "-mgeneral-regs-only");
1328 else
1329 error ("%qs is incompatible with the use of vector types",
1330 "-mgeneral-regs-only");
1331 else
1332 if (FLOAT_MODE_P (mode))
1333 error ("%qs feature modifier is incompatible with the use of"
1334 " floating-point types", "+nofp");
1335 else
1336 error ("%qs feature modifier is incompatible with the use of"
1337 " vector types", "+nofp");
1338 }
1339
1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1341 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1342 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1343 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1344 and GENERAL_REGS is lower than the memory cost (in this case the best class
1345 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1346 cost results in bad allocations with many redundant int<->FP moves which
1347 are expensive on various cores.
1348 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1349 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1350 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1351 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1352 The result of this is that it is no longer inefficient to have a higher
1353 memory move cost than the register move cost.
1354 */
1355
1356 static reg_class_t
1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1358 reg_class_t best_class)
1359 {
1360 machine_mode mode;
1361
1362 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1363 || !reg_class_subset_p (FP_REGS, allocno_class))
1364 return allocno_class;
1365
1366 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1367 || !reg_class_subset_p (FP_REGS, best_class))
1368 return best_class;
1369
1370 mode = PSEUDO_REGNO_MODE (regno);
1371 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1372 }
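/* Concrete example (illustrative): for a DFmode pseudo whose allocno and
   best classes are both POINTER_AND_FP_REGS, the hook returns FP_REGS,
   while an SImode pseudo in the same situation gets GENERAL_REGS; if
   either incoming class already excludes GENERAL_REGS or FP_REGS, that
   class is returned unchanged.  */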
1373
1374 static unsigned int
1375 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1376 {
1377 if (GET_MODE_UNIT_SIZE (mode) == 4)
1378 return aarch64_tune_params.min_div_recip_mul_sf;
1379 return aarch64_tune_params.min_div_recip_mul_df;
1380 }
1381
1382 /* Return the reassociation width of treeop OPC with mode MODE. */
1383 static int
1384 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1385 {
1386 if (VECTOR_MODE_P (mode))
1387 return aarch64_tune_params.vec_reassoc_width;
1388 if (INTEGRAL_MODE_P (mode))
1389 return aarch64_tune_params.int_reassoc_width;
1390 /* Avoid reassociating floating point addition so we emit more FMAs. */
1391 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1392 return aarch64_tune_params.fp_reassoc_width;
1393 return 1;
1394 }
1395
1396 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1397 unsigned
1398 aarch64_dbx_register_number (unsigned regno)
1399 {
1400 if (GP_REGNUM_P (regno))
1401 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1402 else if (regno == SP_REGNUM)
1403 return AARCH64_DWARF_SP;
1404 else if (FP_REGNUM_P (regno))
1405 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1406 else if (PR_REGNUM_P (regno))
1407 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1408 else if (regno == VG_REGNUM)
1409 return AARCH64_DWARF_VG;
1410
1411 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1412 equivalent DWARF register. */
1413 return DWARF_FRAME_REGISTERS;
1414 }
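/* Example mappings, assuming the usual AARCH64_DWARF_* values
   (R0 == 0, SP == 31, V0 == 64, P0 == 48): x7 maps to 7, sp to 31,
   v2 to 66 and p1 to 49; registers with no DWARF equivalent (such as
   the condition flags) report DWARF_FRAME_REGISTERS instead.  */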
1415
1416 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1417 static bool
1418 aarch64_advsimd_struct_mode_p (machine_mode mode)
1419 {
1420 return (TARGET_SIMD
1421 && (mode == OImode || mode == CImode || mode == XImode));
1422 }
1423
1424 /* Return true if MODE is an SVE predicate mode. */
1425 static bool
1426 aarch64_sve_pred_mode_p (machine_mode mode)
1427 {
1428 return (TARGET_SVE
1429 && (mode == VNx16BImode
1430 || mode == VNx8BImode
1431 || mode == VNx4BImode
1432 || mode == VNx2BImode));
1433 }
1434
1435 /* Three mutually-exclusive flags describing a vector or predicate type. */
1436 const unsigned int VEC_ADVSIMD = 1;
1437 const unsigned int VEC_SVE_DATA = 2;
1438 const unsigned int VEC_SVE_PRED = 4;
1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1440 a structure of 2, 3 or 4 vectors. */
1441 const unsigned int VEC_STRUCT = 8;
1442 /* Useful combinations of the above. */
1443 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1445
1446 /* Return a set of flags describing the vector properties of mode MODE.
1447 Ignore modes that are not supported by the current target. */
1448 static unsigned int
1449 aarch64_classify_vector_mode (machine_mode mode)
1450 {
1451 if (aarch64_advsimd_struct_mode_p (mode))
1452 return VEC_ADVSIMD | VEC_STRUCT;
1453
1454 if (aarch64_sve_pred_mode_p (mode))
1455 return VEC_SVE_PRED;
1456
1457 scalar_mode inner = GET_MODE_INNER (mode);
1458 if (VECTOR_MODE_P (mode)
1459 && (inner == QImode
1460 || inner == HImode
1461 || inner == HFmode
1462 || inner == SImode
1463 || inner == SFmode
1464 || inner == DImode
1465 || inner == DFmode))
1466 {
1467 if (TARGET_SVE)
1468 {
1469 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1470 return VEC_SVE_DATA;
1471 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1472 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1473 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1474 return VEC_SVE_DATA | VEC_STRUCT;
1475 }
1476
1477 /* This includes V1DF but not V1DI (which doesn't exist). */
1478 if (TARGET_SIMD
1479 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1480 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1481 return VEC_ADVSIMD;
1482 }
1483
1484 return 0;
1485 }
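/* Illustrative classifications, assuming Advanced SIMD plus
   variable-length SVE: V4SImode (a 128-bit Advanced SIMD vector) gives
   VEC_ADVSIMD, OImode (a pair of such vectors) gives
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode gives VEC_SVE_DATA, VNx8SImode
   gives VEC_SVE_DATA | VEC_STRUCT and VNx4BImode gives VEC_SVE_PRED;
   anything unsupported classifies as 0.  */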
1486
1487 /* Return true if MODE is any of the data vector modes, including
1488 structure modes. */
1489 static bool
1490 aarch64_vector_data_mode_p (machine_mode mode)
1491 {
1492 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1493 }
1494
1495 /* Return true if MODE is an SVE data vector mode; either a single vector
1496 or a structure of vectors. */
1497 static bool
1498 aarch64_sve_data_mode_p (machine_mode mode)
1499 {
1500 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1501 }
1502
1503 /* Implement target hook TARGET_ARRAY_MODE. */
1504 static opt_machine_mode
1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1506 {
1507 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1508 && IN_RANGE (nelems, 2, 4))
1509 return mode_for_vector (GET_MODE_INNER (mode),
1510 GET_MODE_NUNITS (mode) * nelems);
1511
1512 return opt_machine_mode ();
1513 }
1514
1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1516 static bool
1517 aarch64_array_mode_supported_p (machine_mode mode,
1518 unsigned HOST_WIDE_INT nelems)
1519 {
1520 if (TARGET_SIMD
1521 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1522 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1523 && (nelems >= 2 && nelems <= 4))
1524 return true;
1525
1526 return false;
1527 }
1528
1529 /* Return the SVE predicate mode to use for elements that have
1530 ELEM_NBYTES bytes, if such a mode exists. */
1531
1532 opt_machine_mode
1533 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1534 {
1535 if (TARGET_SVE)
1536 {
1537 if (elem_nbytes == 1)
1538 return VNx16BImode;
1539 if (elem_nbytes == 2)
1540 return VNx8BImode;
1541 if (elem_nbytes == 4)
1542 return VNx4BImode;
1543 if (elem_nbytes == 8)
1544 return VNx2BImode;
1545 }
1546 return opt_machine_mode ();
1547 }
1548
1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1550
1551 static opt_machine_mode
1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1553 {
1554 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1555 {
1556 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1557 machine_mode pred_mode;
1558 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1559 return pred_mode;
1560 }
1561
1562 return default_get_mask_mode (nunits, nbytes);
1563 }
1564
1565 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1566 prefer to use the first arithmetic operand as the else value if
1567 the else value doesn't matter, since that exactly matches the SVE
1568 destructive merging form. For ternary operations we could either
1569 pick the first operand and use FMAD-like instructions or the last
1570 operand and use FMLA-like instructions; the latter seems more
1571 natural. */
1572
1573 static tree
1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1575 {
1576 return nops == 3 ? ops[2] : ops[0];
1577 }
1578
1579 /* Implement TARGET_HARD_REGNO_NREGS. */
1580
1581 static unsigned int
1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1583 {
1584 /* ??? Logically we should only need to provide a value when
1585 HARD_REGNO_MODE_OK says that the combination is valid,
1586 but at the moment we need to handle all modes. Just ignore
1587 any runtime parts for registers that can't store them. */
1588 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1589 switch (aarch64_regno_regclass (regno))
1590 {
1591 case FP_REGS:
1592 case FP_LO_REGS:
1593 if (aarch64_sve_data_mode_p (mode))
1594 return exact_div (GET_MODE_SIZE (mode),
1595 BYTES_PER_SVE_VECTOR).to_constant ();
1596 return CEIL (lowest_size, UNITS_PER_VREG);
1597 case PR_REGS:
1598 case PR_LO_REGS:
1599 case PR_HI_REGS:
1600 return 1;
1601 default:
1602 return CEIL (lowest_size, UNITS_PER_WORD);
1603 }
1604 gcc_unreachable ();
1605 }
1606
1607 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1608
1609 static bool
1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1611 {
1612 if (GET_MODE_CLASS (mode) == MODE_CC)
1613 return regno == CC_REGNUM;
1614
1615 if (regno == VG_REGNUM)
1616 /* This must have the same size as _Unwind_Word. */
1617 return mode == DImode;
1618
1619 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1620 if (vec_flags & VEC_SVE_PRED)
1621 return PR_REGNUM_P (regno);
1622
1623 if (PR_REGNUM_P (regno))
1624 return 0;
1625
1626 if (regno == SP_REGNUM)
1627 /* The purpose of comparing with ptr_mode is to support the
1628 global register variable associated with the stack pointer
1629 register via the syntax of asm ("wsp") in ILP32. */
1630 return mode == Pmode || mode == ptr_mode;
1631
1632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1633 return mode == Pmode;
1634
1635 if (GP_REGNUM_P (regno))
1636 {
1637 if (known_le (GET_MODE_SIZE (mode), 8))
1638 return true;
1639 else if (known_le (GET_MODE_SIZE (mode), 16))
1640 return (regno & 1) == 0;
1641 }
1642 else if (FP_REGNUM_P (regno))
1643 {
1644 if (vec_flags & VEC_STRUCT)
1645 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1646 else
1647 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1648 }
1649
1650 return false;
1651 }
1652
1653 /* Return true if this is a definition of a vectorized simd function. */
1654
1655 static bool
1656 aarch64_simd_decl_p (tree fndecl)
1657 {
1658 tree fntype;
1659
1660 if (fndecl == NULL)
1661 return false;
1662 fntype = TREE_TYPE (fndecl);
1663 if (fntype == NULL)
1664 return false;
1665
1666 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1667 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1668 return true;
1669
1670 return false;
1671 }
1672
1673 /* Return the mode a register save/restore should use. DImode for integer
1674 registers, DFmode for FP registers in non-SIMD functions (they only save
1675 the bottom half of a 128 bit register), or TFmode for FP registers in
1676 SIMD functions. */
1677
1678 static machine_mode
1679 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1680 {
1681 return GP_REGNUM_P (regno)
1682 ? E_DImode
1683 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1684 }
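/* For example: x19 is always saved and restored as DImode; v10 is saved
   as DFmode by an ordinary function (only its low 64 bits are
   call-preserved) but as TFmode by a function using the vector PCS,
   which must preserve the full 128-bit register.  */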
1685
1686 /* Return true if the instruction is a call to a SIMD function, false
1687 if it is not a SIMD function or if we do not know anything about
1688 the function. */
1689
1690 static bool
1691 aarch64_simd_call_p (rtx_insn *insn)
1692 {
1693 rtx symbol;
1694 rtx call;
1695 tree fndecl;
1696
1697 gcc_assert (CALL_P (insn));
1698 call = get_call_rtx_from (insn);
1699 symbol = XEXP (XEXP (call, 0), 0);
1700 if (GET_CODE (symbol) != SYMBOL_REF)
1701 return false;
1702 fndecl = SYMBOL_REF_DECL (symbol);
1703 if (!fndecl)
1704 return false;
1705
1706 return aarch64_simd_decl_p (fndecl);
1707 }
1708
1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1710 a function that uses the SIMD ABI, take advantage of the extra
1711 call-preserved registers that the ABI provides. */
1712
1713 void
1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1715 HARD_REG_SET *return_set)
1716 {
1717 if (aarch64_simd_call_p (insn))
1718 {
1719 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1720 if (FP_SIMD_SAVED_REGNUM_P (regno))
1721 CLEAR_HARD_REG_BIT (*return_set, regno);
1722 }
1723 }
1724
1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1726 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1727 clobbers the top 64 bits when restoring the bottom 64 bits. */
1728
1729 static bool
1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1731 machine_mode mode)
1732 {
1733 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1734 return FP_REGNUM_P (regno)
1735 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1736 }
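/* Illustrative case: a V4SImode (128-bit) value in v9 is part-clobbered
   by an ordinary call, since only the low 64 bits of the register are
   call-preserved, but not by a call to a SIMD-ABI function, which
   preserves the full 128 bits; anything wider than 128 bits (e.g. an
   SVE data vector) is part-clobbered even by SIMD calls.  */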
1737
1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1739
1740 rtx_insn *
1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1742 {
1743 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1744
1745 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1746 return call_1;
1747 else
1748 return call_2;
1749 }
1750
1751 /* Implement REGMODE_NATURAL_SIZE. */
1752 poly_uint64
1753 aarch64_regmode_natural_size (machine_mode mode)
1754 {
1755 /* The natural size for SVE data modes is one SVE data vector,
1756 and similarly for predicates. We can't independently modify
1757 anything smaller than that. */
1758 /* ??? For now, only do this for variable-width SVE registers.
1759 Doing it for constant-sized registers breaks lower-subreg.c. */
1760 /* ??? And once that's fixed, we should probably have similar
1761 code for Advanced SIMD. */
1762 if (!aarch64_sve_vg.is_constant ())
1763 {
1764 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1765 if (vec_flags & VEC_SVE_PRED)
1766 return BYTES_PER_SVE_PRED;
1767 if (vec_flags & VEC_SVE_DATA)
1768 return BYTES_PER_SVE_VECTOR;
1769 }
1770 return UNITS_PER_WORD;
1771 }
1772
1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1774 machine_mode
1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1776 machine_mode mode)
1777 {
1778 /* The predicate mode determines which bits are significant and
1779 which are "don't care". Decreasing the number of lanes would
1780 lose data while increasing the number of lanes would make bits
1781 unnecessarily significant. */
1782 if (PR_REGNUM_P (regno))
1783 return mode;
1784 if (known_ge (GET_MODE_SIZE (mode), 4))
1785 return mode;
1786 else
1787 return SImode;
1788 }
1789
1790 /* Return true if I's bits are consecutive ones from the MSB. */
1791 bool
1792 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1793 {
1794 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1795 }
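
/* A minimal standalone sketch (illustrative, compiled separately from this
   file) of the test above: I's bits are consecutive ones from the MSB
   exactly when -I is a nonzero power of two.  The helper name and example
   values below are placeholders, not GCC interfaces.  */

#include <cstdint>
#include <cstdio>

static bool
high_bits_all_ones_p_sketch (int64_t i)
{
  /* Negate in unsigned arithmetic to avoid signed overflow.  */
  uint64_t neg = -(uint64_t) i;
  /* A nonzero power of two has exactly one bit set.  */
  return neg != 0 && (neg & (neg - 1)) == 0;
}

int
main ()
{
  int64_t ones_from_msb = (int64_t) 0xffffffffffff0000ull;
  printf ("%d\n", high_bits_all_ones_p_sketch (ones_from_msb));      /* 1 */
  printf ("%d\n", high_bits_all_ones_p_sketch (-1));                 /* 1 */
  printf ("%d\n", high_bits_all_ones_p_sketch (0x00ff000000000000)); /* 0 */
  return 0;
}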
1796
1797 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1798 that strcpy from constants will be faster. */
1799
1800 static HOST_WIDE_INT
1801 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1802 {
1803 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1804 return MAX (align, BITS_PER_WORD);
1805 return align;
1806 }
1807
1808 /* Return true if calls to DECL should be treated as
1809 long-calls (i.e. called via a register). */
1810 static bool
1811 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1812 {
1813 return false;
1814 }
1815
1816 /* Return true if calls to symbol-ref SYM should be treated as
1817 long-calls (i.e. called via a register). */
1818 bool
1819 aarch64_is_long_call_p (rtx sym)
1820 {
1821 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1822 }
1823
1824 /* Return true if calls to symbol-ref SYM should not go through
1825 plt stubs. */
1826
1827 bool
1828 aarch64_is_noplt_call_p (rtx sym)
1829 {
1830 const_tree decl = SYMBOL_REF_DECL (sym);
1831
1832 if (flag_pic
1833 && decl
1834 && (!flag_plt
1835 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1836 && !targetm.binds_local_p (decl))
1837 return true;
1838
1839 return false;
1840 }
1841
1842 /* Return true if the offsets to a zero/sign-extract operation
1843 represent an expression that matches an extend operation. The
1844 operands represent the parameters from
1845
1846 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1847 bool
1848 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1849 rtx extract_imm)
1850 {
1851 HOST_WIDE_INT mult_val, extract_val;
1852
1853 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1854 return false;
1855
1856 mult_val = INTVAL (mult_imm);
1857 extract_val = INTVAL (extract_imm);
1858
1859 if (extract_val > 8
1860 && extract_val < GET_MODE_BITSIZE (mode)
1861 && exact_log2 (extract_val & ~7) > 0
1862 && (extract_val & 7) <= 4
1863 && mult_val == (1 << (extract_val & 7)))
1864 return true;
1865
1866 return false;
1867 }
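
/* Worked example (illustrative): for MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 the conditions above hold (34 & ~7 == 32 is a power of
   two, 34 & 7 == 2 <= 4, and 4 == 1 << 2).  Keeping the low 34 bits of
   (reg * 4) is the same as multiplying the low 32 bits of reg by 4, so the
   extract matches an extend of a 32-bit value shifted left by 2.  */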
1868
1869 /* Emit an insn that's a simple single-set. Both the operands must be
1870 known to be valid. */
1871 inline static rtx_insn *
1872 emit_set_insn (rtx x, rtx y)
1873 {
1874 return emit_insn (gen_rtx_SET (x, y));
1875 }
1876
1877 /* X and Y are two things to compare using CODE. Emit the compare insn and
1878 return the rtx for register 0 in the proper mode. */
1879 rtx
1880 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1881 {
1882 machine_mode mode = SELECT_CC_MODE (code, x, y);
1883 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1884
1885 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1886 return cc_reg;
1887 }
1888
1889 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1890
1891 static rtx
1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1893 machine_mode y_mode)
1894 {
1895 if (y_mode == E_QImode || y_mode == E_HImode)
1896 {
1897 if (CONST_INT_P (y))
1898 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1899 else
1900 {
1901 rtx t, cc_reg;
1902 machine_mode cc_mode;
1903
1904 t = gen_rtx_ZERO_EXTEND (SImode, y);
1905 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1906 cc_mode = CC_SWPmode;
1907 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1908 emit_set_insn (cc_reg, t);
1909 return cc_reg;
1910 }
1911 }
1912
1913 return aarch64_gen_compare_reg (code, x, y);
1914 }
1915
1916 /* Build the SYMBOL_REF for __tls_get_addr. */
1917
1918 static GTY(()) rtx tls_get_addr_libfunc;
1919
1920 rtx
1921 aarch64_tls_get_addr (void)
1922 {
1923 if (!tls_get_addr_libfunc)
1924 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1925 return tls_get_addr_libfunc;
1926 }
1927
1928 /* Return the TLS model to use for ADDR. */
1929
1930 static enum tls_model
1931 tls_symbolic_operand_type (rtx addr)
1932 {
1933 enum tls_model tls_kind = TLS_MODEL_NONE;
1934 if (GET_CODE (addr) == CONST)
1935 {
1936 poly_int64 addend;
1937 rtx sym = strip_offset (addr, &addend);
1938 if (GET_CODE (sym) == SYMBOL_REF)
1939 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1940 }
1941 else if (GET_CODE (addr) == SYMBOL_REF)
1942 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1943
1944 return tls_kind;
1945 }
1946
1947 /* We allow LO_SUMs in our legitimate addresses so that combine
1948 can take care of combining addresses where necessary, but for
1949 generation purposes we generate the address as:
1950 
1951 RTL Absolute
1952 tmp = hi (symbol_ref); adrp x1, foo
1953 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1954 nop
1955
1956 PIC TLS
1957 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1958 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1959 bl __tls_get_addr
1960 nop
1961
1962 Load TLS symbol, depending on TLS mechanism and TLS access model.
1963
1964 Global Dynamic - Traditional TLS:
1965 adrp tmp, :tlsgd:imm
1966 add dest, tmp, #:tlsgd_lo12:imm
1967 bl __tls_get_addr
1968
1969 Global Dynamic - TLS Descriptors:
1970 adrp dest, :tlsdesc:imm
1971 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1972 add dest, dest, #:tlsdesc_lo12:imm
1973 blr tmp
1974 mrs tp, tpidr_el0
1975 add dest, dest, tp
1976
1977 Initial Exec:
1978 mrs tp, tpidr_el0
1979 adrp tmp, :gottprel:imm
1980 ldr dest, [tmp, #:gottprel_lo12:imm]
1981 add dest, dest, tp
1982
1983 Local Exec:
1984 mrs tp, tpidr_el0
1985 add t0, tp, #:tprel_hi12:imm, lsl #12
1986 add t0, t0, #:tprel_lo12_nc:imm
1987 */
1988
1989 static void
1990 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1991 enum aarch64_symbol_type type)
1992 {
1993 switch (type)
1994 {
1995 case SYMBOL_SMALL_ABSOLUTE:
1996 {
1997 /* In ILP32, the mode of dest can be either SImode or DImode. */
1998 rtx tmp_reg = dest;
1999 machine_mode mode = GET_MODE (dest);
2000
2001 gcc_assert (mode == Pmode || mode == ptr_mode);
2002
2003 if (can_create_pseudo_p ())
2004 tmp_reg = gen_reg_rtx (mode);
2005
2006 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2007 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2008 return;
2009 }
2010
2011 case SYMBOL_TINY_ABSOLUTE:
2012 emit_insn (gen_rtx_SET (dest, imm));
2013 return;
2014
2015 case SYMBOL_SMALL_GOT_28K:
2016 {
2017 machine_mode mode = GET_MODE (dest);
2018 rtx gp_rtx = pic_offset_table_rtx;
2019 rtx insn;
2020 rtx mem;
2021
2022 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2023 here before RTL expansion. Tree IVOPTS will generate RTL patterns
2024 to decide rtx costs, in which case pic_offset_table_rtx is not
2025 initialized. In that case there is no need to generate the first
2026 adrp instruction, as the final cost for a global variable access
2027 is one instruction. */
2028 if (gp_rtx != NULL)
2029 {
2030 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2031 use the page base as the GOT base, the first page may be wasted; in
2032 the worst case only 28K of space is left for the GOT).
2033 
2034 The generated instruction sequence for accessing a global variable
2035 is:
2036 
2037 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2038 
2039 Only one instruction is needed, but we must initialize
2040 pic_offset_table_rtx properly. We generate an initialization insn
2041 for every global access, and allow CSE to remove all redundant ones.
2042 
2043 The final instruction sequence will look like the following
2044 when multiple global variables are accessed.
2045
2046 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2047
2048 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2049 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2050 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2051 ... */
2052
2053 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2054 crtl->uses_pic_offset_table = 1;
2055 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2056
2057 if (mode != GET_MODE (gp_rtx))
2058 gp_rtx = gen_lowpart (mode, gp_rtx);
2059
2060 }
2061
2062 if (mode == ptr_mode)
2063 {
2064 if (mode == DImode)
2065 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2066 else
2067 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2068
2069 mem = XVECEXP (SET_SRC (insn), 0, 0);
2070 }
2071 else
2072 {
2073 gcc_assert (mode == Pmode);
2074
2075 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2076 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2077 }
2078
2079 /* The operand is expected to be a MEM. Whenever the related insn
2080 pattern is changed, the above code which calculates MEM should be
2081 updated. */
2082 gcc_assert (GET_CODE (mem) == MEM);
2083 MEM_READONLY_P (mem) = 1;
2084 MEM_NOTRAP_P (mem) = 1;
2085 emit_insn (insn);
2086 return;
2087 }
2088
2089 case SYMBOL_SMALL_GOT_4G:
2090 {
2091 /* In ILP32, the mode of dest can be either SImode or DImode,
2092 while the got entry is always of SImode size. The mode of
2093 dest depends on how dest is used: if dest is assigned to a
2094 pointer (e.g. in the memory), it has SImode; it may have
2095 DImode if dest is dereferenced to access the memory.
2096 This is why we have to handle three different ldr_got_small
2097 patterns here (two patterns for ILP32). */
2098
2099 rtx insn;
2100 rtx mem;
2101 rtx tmp_reg = dest;
2102 machine_mode mode = GET_MODE (dest);
2103
2104 if (can_create_pseudo_p ())
2105 tmp_reg = gen_reg_rtx (mode);
2106
2107 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2108 if (mode == ptr_mode)
2109 {
2110 if (mode == DImode)
2111 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2112 else
2113 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2114
2115 mem = XVECEXP (SET_SRC (insn), 0, 0);
2116 }
2117 else
2118 {
2119 gcc_assert (mode == Pmode);
2120
2121 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2122 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2123 }
2124
2125 gcc_assert (GET_CODE (mem) == MEM);
2126 MEM_READONLY_P (mem) = 1;
2127 MEM_NOTRAP_P (mem) = 1;
2128 emit_insn (insn);
2129 return;
2130 }
2131
2132 case SYMBOL_SMALL_TLSGD:
2133 {
2134 rtx_insn *insns;
2135 machine_mode mode = GET_MODE (dest);
2136 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2137
2138 start_sequence ();
2139 if (TARGET_ILP32)
2140 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2141 else
2142 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2143 insns = get_insns ();
2144 end_sequence ();
2145
2146 RTL_CONST_CALL_P (insns) = 1;
2147 emit_libcall_block (insns, dest, result, imm);
2148 return;
2149 }
2150
2151 case SYMBOL_SMALL_TLSDESC:
2152 {
2153 machine_mode mode = GET_MODE (dest);
2154 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2155 rtx tp;
2156
2157 gcc_assert (mode == Pmode || mode == ptr_mode);
2158
2159 /* In ILP32, the got entry is always of SImode size. Unlike
2160 small GOT, the dest is fixed at reg 0. */
2161 if (TARGET_ILP32)
2162 emit_insn (gen_tlsdesc_small_si (imm));
2163 else
2164 emit_insn (gen_tlsdesc_small_di (imm));
2165 tp = aarch64_load_tp (NULL);
2166
2167 if (mode != Pmode)
2168 tp = gen_lowpart (mode, tp);
2169
2170 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2171 if (REG_P (dest))
2172 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2173 return;
2174 }
2175
2176 case SYMBOL_SMALL_TLSIE:
2177 {
2178 /* In ILP32, the mode of dest can be either SImode or DImode,
2179 while the got entry is always of SImode size. The mode of
2180 dest depends on how dest is used: if dest is assigned to a
2181 pointer (e.g. in the memory), it has SImode; it may have
2182 DImode if dest is dereferenced to access the memory.
2183 This is why we have to handle three different tlsie_small
2184 patterns here (two patterns for ILP32). */
2185 machine_mode mode = GET_MODE (dest);
2186 rtx tmp_reg = gen_reg_rtx (mode);
2187 rtx tp = aarch64_load_tp (NULL);
2188
2189 if (mode == ptr_mode)
2190 {
2191 if (mode == DImode)
2192 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2193 else
2194 {
2195 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2196 tp = gen_lowpart (mode, tp);
2197 }
2198 }
2199 else
2200 {
2201 gcc_assert (mode == Pmode);
2202 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2203 }
2204
2205 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2206 if (REG_P (dest))
2207 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2208 return;
2209 }
2210
2211 case SYMBOL_TLSLE12:
2212 case SYMBOL_TLSLE24:
2213 case SYMBOL_TLSLE32:
2214 case SYMBOL_TLSLE48:
2215 {
2216 machine_mode mode = GET_MODE (dest);
2217 rtx tp = aarch64_load_tp (NULL);
2218
2219 if (mode != Pmode)
2220 tp = gen_lowpart (mode, tp);
2221
2222 switch (type)
2223 {
2224 case SYMBOL_TLSLE12:
2225 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2226 (dest, tp, imm));
2227 break;
2228 case SYMBOL_TLSLE24:
2229 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2230 (dest, tp, imm));
2231 break;
2232 case SYMBOL_TLSLE32:
2233 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2234 (dest, imm));
2235 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2236 (dest, dest, tp));
2237 break;
2238 case SYMBOL_TLSLE48:
2239 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2240 (dest, imm));
2241 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2242 (dest, dest, tp));
2243 break;
2244 default:
2245 gcc_unreachable ();
2246 }
2247
2248 if (REG_P (dest))
2249 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2250 return;
2251 }
2252
2253 case SYMBOL_TINY_GOT:
2254 emit_insn (gen_ldr_got_tiny (dest, imm));
2255 return;
2256
2257 case SYMBOL_TINY_TLSIE:
2258 {
2259 machine_mode mode = GET_MODE (dest);
2260 rtx tp = aarch64_load_tp (NULL);
2261
2262 if (mode == ptr_mode)
2263 {
2264 if (mode == DImode)
2265 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2266 else
2267 {
2268 tp = gen_lowpart (mode, tp);
2269 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2270 }
2271 }
2272 else
2273 {
2274 gcc_assert (mode == Pmode);
2275 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2276 }
2277
2278 if (REG_P (dest))
2279 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2280 return;
2281 }
2282
2283 default:
2284 gcc_unreachable ();
2285 }
2286 }
2287
2288 /* Emit a move from SRC to DEST. Assume that the move expanders can
2289 handle all moves if !can_create_pseudo_p (). The distinction is
2290 important because, unlike emit_move_insn, the move expanders know
2291 how to force Pmode objects into the constant pool even when the
2292 constant pool address is not itself legitimate. */
2293 static rtx
2294 aarch64_emit_move (rtx dest, rtx src)
2295 {
2296 return (can_create_pseudo_p ()
2297 ? emit_move_insn (dest, src)
2298 : emit_move_insn_1 (dest, src));
2299 }
2300
2301 /* Apply UNOPTAB to OP and store the result in DEST. */
2302
2303 static void
2304 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2305 {
2306 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2307 if (dest != tmp)
2308 emit_move_insn (dest, tmp);
2309 }
2310
2311 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2312
2313 static void
2314 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2315 {
2316 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2317 OPTAB_DIRECT);
2318 if (dest != tmp)
2319 emit_move_insn (dest, tmp);
2320 }
2321
2322 /* Split a 128-bit move operation into two 64-bit move operations,
2323 taking care to handle partial overlap of register to register
2324 copies. Special cases are needed when moving between GP regs and
2325 FP regs. SRC can be a register, constant or memory; DST a register
2326 or memory. If either operand is memory it must not have any side
2327 effects. */
2328 void
2329 aarch64_split_128bit_move (rtx dst, rtx src)
2330 {
2331 rtx dst_lo, dst_hi;
2332 rtx src_lo, src_hi;
2333
2334 machine_mode mode = GET_MODE (dst);
2335
2336 gcc_assert (mode == TImode || mode == TFmode);
2337 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2338 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2339
2340 if (REG_P (dst) && REG_P (src))
2341 {
2342 int src_regno = REGNO (src);
2343 int dst_regno = REGNO (dst);
2344
2345 /* Handle FP <-> GP regs. */
2346 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2347 {
2348 src_lo = gen_lowpart (word_mode, src);
2349 src_hi = gen_highpart (word_mode, src);
2350
2351 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2352 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2353 return;
2354 }
2355 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2356 {
2357 dst_lo = gen_lowpart (word_mode, dst);
2358 dst_hi = gen_highpart (word_mode, dst);
2359
2360 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2361 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2362 return;
2363 }
2364 }
2365
2366 dst_lo = gen_lowpart (word_mode, dst);
2367 dst_hi = gen_highpart (word_mode, dst);
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart_mode (word_mode, mode, src);
2370
2371 /* At most one pairing may overlap. */
2372 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2373 {
2374 aarch64_emit_move (dst_hi, src_hi);
2375 aarch64_emit_move (dst_lo, src_lo);
2376 }
2377 else
2378 {
2379 aarch64_emit_move (dst_lo, src_lo);
2380 aarch64_emit_move (dst_hi, src_hi);
2381 }
2382 }
2383
2384 bool
2385 aarch64_split_128bit_move_p (rtx dst, rtx src)
2386 {
2387 return (! REG_P (src)
2388 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2389 }
2390
2391 /* Split a complex SIMD combine. */
2392
2393 void
2394 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2395 {
2396 machine_mode src_mode = GET_MODE (src1);
2397 machine_mode dst_mode = GET_MODE (dst);
2398
2399 gcc_assert (VECTOR_MODE_P (dst_mode));
2400 gcc_assert (register_operand (dst, dst_mode)
2401 && register_operand (src1, src_mode)
2402 && register_operand (src2, src_mode));
2403
2404 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2405 return;
2406 }
2407
2408 /* Split a complex SIMD move. */
2409
2410 void
2411 aarch64_split_simd_move (rtx dst, rtx src)
2412 {
2413 machine_mode src_mode = GET_MODE (src);
2414 machine_mode dst_mode = GET_MODE (dst);
2415
2416 gcc_assert (VECTOR_MODE_P (dst_mode));
2417
2418 if (REG_P (dst) && REG_P (src))
2419 {
2420 gcc_assert (VECTOR_MODE_P (src_mode));
2421 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2422 }
2423 }
2424
2425 bool
2426 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2427 machine_mode ymode, rtx y)
2428 {
2429 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2430 gcc_assert (r != NULL);
2431 return rtx_equal_p (x, r);
2432 }
2433
2434
2435 static rtx
2436 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2437 {
2438 if (can_create_pseudo_p ())
2439 return force_reg (mode, value);
2440 else
2441 {
2442 gcc_assert (x);
2443 aarch64_emit_move (x, value);
2444 return x;
2445 }
2446 }
2447
2448 /* Return true if we can move VALUE into a register using a single
2449 CNT[BHWD] instruction. */
2450
2451 static bool
2452 aarch64_sve_cnt_immediate_p (poly_int64 value)
2453 {
2454 HOST_WIDE_INT factor = value.coeffs[0];
2455 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2456 return (value.coeffs[1] == factor
2457 && IN_RANGE (factor, 2, 16 * 16)
2458 && (factor & 1) == 0
2459 && factor <= 16 * (factor & -factor));
2460 }
2461
2462 /* Likewise for rtx X. */
2463
2464 bool
2465 aarch64_sve_cnt_immediate_p (rtx x)
2466 {
2467 poly_int64 value;
2468 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2469 }
2470
2471 /* Return the asm string for an instruction with a CNT-like vector size
2472 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2473 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2474 first part of the operands template (the part that comes before the
2475 vector size itself). FACTOR is the number of quadwords.
2476 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2477 If it is zero, we can use any element size. */
2478
2479 static char *
2480 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2481 unsigned int factor,
2482 unsigned int nelts_per_vq)
2483 {
2484 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2485
2486 if (nelts_per_vq == 0)
2487 /* There is some overlap in the ranges of the four CNT instructions.
2488 Here we always use the smallest possible element size, so that the
2489 multiplier is 1 wherever possible. */
2490 nelts_per_vq = factor & -factor;
2491 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2492 gcc_assert (IN_RANGE (shift, 1, 4));
2493 char suffix = "dwhb"[shift - 1];
2494
2495 factor >>= shift;
2496 unsigned int written;
2497 if (factor == 1)
2498 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2499 prefix, suffix, operands);
2500 else
2501 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2502 prefix, suffix, operands, factor);
2503 gcc_assert (written < sizeof (buffer));
2504 return buffer;
2505 }
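
/* A minimal standalone sketch (illustrative, compiled separately from this
   file) of the suffix and multiplier selection above for the
   NELTS_PER_VQ == 0 case, assuming FACTOR is already valid according to
   aarch64_sve_cnt_immediate_p.  The "cnt" mnemonic and x0 operand are
   placeholders for PREFIX and OPERANDS.  */

#include <cstdio>
#include <string>

static std::string
sve_cnt_asm_sketch (unsigned int factor)
{
  /* Use the smallest element count per 128-bit quadword that divides
     FACTOR, capped at 16 (bytes).  */
  unsigned int nelts_per_vq = factor & -factor;
  int shift = __builtin_ctz (nelts_per_vq);
  if (shift > 4)
    shift = 4;
  char suffix = "dwhb"[shift - 1];
  factor >>= shift;

  char buf[64];
  if (factor == 1)
    snprintf (buf, sizeof buf, "cnt%c\tx0", suffix);
  else
    snprintf (buf, sizeof buf, "cnt%c\tx0, all, mul #%u", suffix, factor);
  return buf;
}

int
main ()
{
  printf ("%s\n", sve_cnt_asm_sketch (2).c_str ());  /* cntd x0              */
  printf ("%s\n", sve_cnt_asm_sketch (6).c_str ());  /* cntd x0, all, mul #3 */
  printf ("%s\n", sve_cnt_asm_sketch (16).c_str ()); /* cntb x0              */
  return 0;
}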
2506
2507 /* Return the asm string for an instruction with a CNT-like vector size
2508 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2509 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2510 first part of the operands template (the part that comes before the
2511 vector size itself). X is the value of the vector size operand,
2512 as a polynomial integer rtx. */
2513
2514 char *
2515 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2516 rtx x)
2517 {
2518 poly_int64 value = rtx_to_poly_int64 (x);
2519 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2520 return aarch64_output_sve_cnt_immediate (prefix, operands,
2521 value.coeffs[1], 0);
2522 }
2523
2524 /* Return true if we can add VALUE to a register using a single ADDVL
2525 or ADDPL instruction. */
2526
2527 static bool
2528 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2529 {
2530 HOST_WIDE_INT factor = value.coeffs[0];
2531 if (factor == 0 || value.coeffs[1] != factor)
2532 return false;
2533 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2534 and a value of 16 is one vector width. */
2535 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2536 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2537 }
2538
2539 /* Likewise for rtx X. */
2540
2541 bool
2542 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2543 {
2544 poly_int64 value;
2545 return (poly_int_rtx_p (x, &value)
2546 && aarch64_sve_addvl_addpl_immediate_p (value));
2547 }
2548
2549 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2550 and storing the result in operand 0. */
2551
2552 char *
2553 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2554 {
2555 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2556 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2557 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2558
2559 /* Use INC or DEC if possible. */
2560 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2561 {
2562 if (aarch64_sve_cnt_immediate_p (offset_value))
2563 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2564 offset_value.coeffs[1], 0);
2565 if (aarch64_sve_cnt_immediate_p (-offset_value))
2566 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2567 -offset_value.coeffs[1], 0);
2568 }
2569
2570 int factor = offset_value.coeffs[1];
2571 if ((factor & 15) == 0)
2572 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2573 else
2574 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2575 return buffer;
2576 }
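
/* Worked examples (illustrative, with x0/x1 standing for operands 0 and 1):
   an offset of one full vector of bytes has FACTOR == 16 and gives
   "addvl x0, x1, #1"; an offset of one predicate of bytes (an eighth of a
   vector) has FACTOR == 2 and gives "addpl x0, x1, #1"; FACTOR == -32
   gives "addvl x0, x1, #-2".  */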
2577
2578 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2579 instruction. If it is, store the number of elements in each vector
2580 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2581 factor in *FACTOR_OUT (if nonnull). */
2582
2583 bool
2584 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2585 unsigned int *nelts_per_vq_out)
2586 {
2587 rtx elt;
2588 poly_int64 value;
2589
2590 if (!const_vec_duplicate_p (x, &elt)
2591 || !poly_int_rtx_p (elt, &value))
2592 return false;
2593
2594 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2595 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2596 /* There's no vector INCB. */
2597 return false;
2598
2599 HOST_WIDE_INT factor = value.coeffs[0];
2600 if (value.coeffs[1] != factor)
2601 return false;
2602
2603 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2604 if ((factor % nelts_per_vq) != 0
2605 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2606 return false;
2607
2608 if (factor_out)
2609 *factor_out = factor;
2610 if (nelts_per_vq_out)
2611 *nelts_per_vq_out = nelts_per_vq;
2612 return true;
2613 }
2614
2615 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2616 instruction. */
2617
2618 bool
2619 aarch64_sve_inc_dec_immediate_p (rtx x)
2620 {
2621 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2622 }
2623
2624 /* Return the asm template for an SVE vector INC or DEC instruction.
2625 OPERANDS gives the operands before the vector count and X is the
2626 value of the vector count operand itself. */
2627
2628 char *
2629 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2630 {
2631 int factor;
2632 unsigned int nelts_per_vq;
2633 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2634 gcc_unreachable ();
2635 if (factor < 0)
2636 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2637 nelts_per_vq);
2638 else
2639 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2640 nelts_per_vq);
2641 }
2642
2643 static int
2644 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2645 scalar_int_mode mode)
2646 {
2647 int i;
2648 unsigned HOST_WIDE_INT val, val2, mask;
2649 int one_match, zero_match;
2650 int num_insns;
2651
2652 val = INTVAL (imm);
2653
2654 if (aarch64_move_imm (val, mode))
2655 {
2656 if (generate)
2657 emit_insn (gen_rtx_SET (dest, imm));
2658 return 1;
2659 }
2660
2661 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2662 (with XXXX non-zero). In that case check to see if the move can be done in
2663 a smaller mode. */
2664 val2 = val & 0xffffffff;
2665 if (mode == DImode
2666 && aarch64_move_imm (val2, SImode)
2667 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2668 {
2669 if (generate)
2670 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2671
2672 /* Check if we have to emit a second instruction by checking to see
2673 if any of the upper 32 bits of the original DI mode value is set. */
2674 if (val == val2)
2675 return 1;
2676
2677 i = (val >> 48) ? 48 : 32;
2678
2679 if (generate)
2680 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2681 GEN_INT ((val >> i) & 0xffff)));
2682
2683 return 2;
2684 }
2685
2686 if ((val >> 32) == 0 || mode == SImode)
2687 {
2688 if (generate)
2689 {
2690 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2691 if (mode == SImode)
2692 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2693 GEN_INT ((val >> 16) & 0xffff)));
2694 else
2695 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2696 GEN_INT ((val >> 16) & 0xffff)));
2697 }
2698 return 2;
2699 }
2700
2701 /* Remaining cases are all for DImode. */
2702
2703 mask = 0xffff;
2704 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2705 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2706 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2707 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2708
2709 if (zero_match != 2 && one_match != 2)
2710 {
2711 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2712 For a 64-bit bitmask try whether changing 16 bits to all ones or
2713 zeroes creates a valid bitmask. To check any repeated bitmask,
2714 try using 16 bits from the other 32-bit half of val. */
2715
2716 for (i = 0; i < 64; i += 16, mask <<= 16)
2717 {
2718 val2 = val & ~mask;
2719 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2720 break;
2721 val2 = val | mask;
2722 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2723 break;
2724 val2 = val2 & ~mask;
2725 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2726 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2727 break;
2728 }
2729 if (i != 64)
2730 {
2731 if (generate)
2732 {
2733 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2734 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2735 GEN_INT ((val >> i) & 0xffff)));
2736 }
2737 return 2;
2738 }
2739 }
2740
2741 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2742 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2743 otherwise skip zero bits. */
2744
2745 num_insns = 1;
2746 mask = 0xffff;
2747 val2 = one_match > zero_match ? ~val : val;
2748 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2749
2750 if (generate)
2751 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2752 ? (val | ~(mask << i))
2753 : (val & (mask << i)))));
2754 for (i += 16; i < 64; i += 16)
2755 {
2756 if ((val2 & (mask << i)) == 0)
2757 continue;
2758 if (generate)
2759 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2760 GEN_INT ((val >> i) & 0xffff)));
2761 num_insns ++;
2762 }
2763
2764 return num_insns;
2765 }
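
/* A simplified standalone sketch (illustrative, compiled separately from
   this file) of the final fallback above: pick an all-zeros or all-ones
   16-bit background, emit one MOVZ/MOVN for the first differing chunk and
   one MOVK for each remaining differing chunk.  It deliberately ignores the
   bitmask-immediate and 32-bit shortcuts handled earlier in the function.  */

#include <cstdint>
#include <cstdio>

static int
mov_imm_insns_sketch (uint64_t val)
{
  int zero_chunks = 0, one_chunks = 0;
  for (int i = 0; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;
      zero_chunks += (chunk == 0);
      one_chunks += (chunk == 0xffff);
    }

  /* The background is whichever of all-zeros (MOVZ) or all-ones (MOVN)
     covers more 16-bit chunks; every other chunk needs an insn.  */
  unsigned int background = one_chunks > zero_chunks ? 0xffff : 0;
  int insns = 0;
  for (int i = 0; i < 64; i += 16)
    if (((val >> i) & 0xffff) != background)
      insns++;
  return insns == 0 ? 1 : insns;
}

int
main ()
{
  printf ("%d\n", mov_imm_insns_sketch (0x1234));             /* 1: movz        */
  printf ("%d\n", mov_imm_insns_sketch (0xffffffffffff1234)); /* 1: movn        */
  printf ("%d\n", mov_imm_insns_sketch (0x12340000abcd0000)); /* 2: movz + movk */
  return 0;
}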
2766
2767 /* Return whether imm is a 128-bit immediate which is simple enough to
2768 expand inline. */
2769 bool
2770 aarch64_mov128_immediate (rtx imm)
2771 {
2772 if (GET_CODE (imm) == CONST_INT)
2773 return true;
2774
2775 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2776
2777 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2778 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2779
2780 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2781 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2782 }
2783
2784
2785 /* Return the number of temporary registers that aarch64_add_offset_1
2786 would need to add OFFSET to a register. */
2787
2788 static unsigned int
2789 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2790 {
2791 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2792 }
2793
2794 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2795 a non-polynomial OFFSET. MODE is the mode of the addition.
2796 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2797 be set and CFA adjustments added to the generated instructions.
2798
2799 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2800 temporary if register allocation is already complete. This temporary
2801 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2802 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2803 the immediate again.
2804
2805 Since this function may be used to adjust the stack pointer, we must
2806 ensure that it cannot cause transient stack deallocation (for example
2807 by first incrementing SP and then decrementing when adjusting by a
2808 large immediate). */
2809
2810 static void
2811 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2812 rtx src, HOST_WIDE_INT offset, rtx temp1,
2813 bool frame_related_p, bool emit_move_imm)
2814 {
2815 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2816 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2817
2818 HOST_WIDE_INT moffset = abs_hwi (offset);
2819 rtx_insn *insn;
2820
2821 if (!moffset)
2822 {
2823 if (!rtx_equal_p (dest, src))
2824 {
2825 insn = emit_insn (gen_rtx_SET (dest, src));
2826 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2827 }
2828 return;
2829 }
2830
2831 /* Single instruction adjustment. */
2832 if (aarch64_uimm12_shift (moffset))
2833 {
2834 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2835 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2836 return;
2837 }
2838
2839 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2840 and either:
2841
2842 a) the offset cannot be loaded by a 16-bit move or
2843 b) there is no spare register into which we can move it. */
2844 if (moffset < 0x1000000
2845 && ((!temp1 && !can_create_pseudo_p ())
2846 || !aarch64_move_imm (moffset, mode)))
2847 {
2848 HOST_WIDE_INT low_off = moffset & 0xfff;
2849
2850 low_off = offset < 0 ? -low_off : low_off;
2851 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2853 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2854 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2855 return;
2856 }
2857
2858 /* Emit a move immediate if required and an addition/subtraction. */
2859 if (emit_move_imm)
2860 {
2861 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2862 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2863 }
2864 insn = emit_insn (offset < 0
2865 ? gen_sub3_insn (dest, src, temp1)
2866 : gen_add3_insn (dest, src, temp1));
2867 if (frame_related_p)
2868 {
2869 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2870 rtx adj = plus_constant (mode, src, offset);
2871 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2872 }
2873 }
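
/* Worked examples (illustrative) of the three cases above:
     OFFSET == 0x123000:   add  dest, src, #0x123, lsl #12
     OFFSET == 0x123456:   add  dest, src, #0x456
                           add  dest, dest, #0x123, lsl #12
     OFFSET == 0x12345678: build the immediate in TEMP1 with the
                           move-immediate code, then
                           add  dest, src, temp1  */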
2874
2875 /* Return the number of temporary registers that aarch64_add_offset
2876 would need to move OFFSET into a register or add OFFSET to a register;
2877 ADD_P is true if we want the latter rather than the former. */
2878
2879 static unsigned int
2880 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2881 {
2882 /* This follows the same structure as aarch64_add_offset. */
2883 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2884 return 0;
2885
2886 unsigned int count = 0;
2887 HOST_WIDE_INT factor = offset.coeffs[1];
2888 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2889 poly_int64 poly_offset (factor, factor);
2890 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2891 /* Need one register for the ADDVL/ADDPL result. */
2892 count += 1;
2893 else if (factor != 0)
2894 {
2895 factor = abs (factor);
2896 if (factor > 16 * (factor & -factor))
2897 /* Need one register for the CNT result and one for the multiplication
2898 factor. If necessary, the second temporary can be reused for the
2899 constant part of the offset. */
2900 return 2;
2901 /* Need one register for the CNT result (which might then
2902 be shifted). */
2903 count += 1;
2904 }
2905 return count + aarch64_add_offset_1_temporaries (constant);
2906 }
2907
2908 /* If X can be represented as a poly_int64, return the number
2909 of temporaries that are required to add it to a register.
2910 Return -1 otherwise. */
2911
2912 int
2913 aarch64_add_offset_temporaries (rtx x)
2914 {
2915 poly_int64 offset;
2916 if (!poly_int_rtx_p (x, &offset))
2917 return -1;
2918 return aarch64_offset_temporaries (true, offset);
2919 }
2920
2921 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2922 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2923 be set and CFA adjustments added to the generated instructions.
2924
2925 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2926 temporary if register allocation is already complete. This temporary
2927 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2928 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2929 false to avoid emitting the immediate again.
2930
2931 TEMP2, if nonnull, is a second temporary register that doesn't
2932 overlap either DEST or SRC.
2933
2934 Since this function may be used to adjust the stack pointer, we must
2935 ensure that it cannot cause transient stack deallocation (for example
2936 by first incrementing SP and then decrementing when adjusting by a
2937 large immediate). */
2938
2939 static void
2940 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2941 poly_int64 offset, rtx temp1, rtx temp2,
2942 bool frame_related_p, bool emit_move_imm = true)
2943 {
2944 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2945 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2946 gcc_assert (temp1 == NULL_RTX
2947 || !frame_related_p
2948 || !reg_overlap_mentioned_p (temp1, dest));
2949 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2950
2951 /* Try using ADDVL or ADDPL to add the whole value. */
2952 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2953 {
2954 rtx offset_rtx = gen_int_mode (offset, mode);
2955 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2956 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2957 return;
2958 }
2959
2960 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2961 SVE vector register, over and above the minimum size of 128 bits.
2962 This is equivalent to half the value returned by CNTD with a
2963 vector shape of ALL. */
2964 HOST_WIDE_INT factor = offset.coeffs[1];
2965 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2966
2967 /* Try using ADDVL or ADDPL to add the VG-based part. */
2968 poly_int64 poly_offset (factor, factor);
2969 if (src != const0_rtx
2970 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2971 {
2972 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2973 if (frame_related_p)
2974 {
2975 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2976 RTX_FRAME_RELATED_P (insn) = true;
2977 src = dest;
2978 }
2979 else
2980 {
2981 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2982 src = aarch64_force_temporary (mode, temp1, addr);
2983 temp1 = temp2;
2984 temp2 = NULL_RTX;
2985 }
2986 }
2987 /* Otherwise use a CNT-based sequence. */
2988 else if (factor != 0)
2989 {
2990 /* Use a subtraction if we have a negative factor. */
2991 rtx_code code = PLUS;
2992 if (factor < 0)
2993 {
2994 factor = -factor;
2995 code = MINUS;
2996 }
2997
2998 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2999 into the multiplication. */
3000 rtx val;
3001 int shift = 0;
3002 if (factor & 1)
3003 /* Use a right shift by 1. */
3004 shift = -1;
3005 else
3006 factor /= 2;
3007 HOST_WIDE_INT low_bit = factor & -factor;
3008 if (factor <= 16 * low_bit)
3009 {
3010 if (factor > 16 * 8)
3011 {
3012 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3013 the value with the minimum multiplier and shift it into
3014 position. */
3015 int extra_shift = exact_log2 (low_bit);
3016 shift += extra_shift;
3017 factor >>= extra_shift;
3018 }
3019 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3020 }
3021 else
3022 {
3023 /* Use CNTD, then multiply it by FACTOR. */
3024 val = gen_int_mode (poly_int64 (2, 2), mode);
3025 val = aarch64_force_temporary (mode, temp1, val);
3026
3027 /* Go back to using a negative multiplication factor if we have
3028 no register from which to subtract. */
3029 if (code == MINUS && src == const0_rtx)
3030 {
3031 factor = -factor;
3032 code = PLUS;
3033 }
3034 rtx coeff1 = gen_int_mode (factor, mode);
3035 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3036 val = gen_rtx_MULT (mode, val, coeff1);
3037 }
3038
3039 if (shift > 0)
3040 {
3041 /* Multiply by 1 << SHIFT. */
3042 val = aarch64_force_temporary (mode, temp1, val);
3043 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3044 }
3045 else if (shift == -1)
3046 {
3047 /* Divide by 2. */
3048 val = aarch64_force_temporary (mode, temp1, val);
3049 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3050 }
3051
3052 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3053 if (src != const0_rtx)
3054 {
3055 val = aarch64_force_temporary (mode, temp1, val);
3056 val = gen_rtx_fmt_ee (code, mode, src, val);
3057 }
3058 else if (code == MINUS)
3059 {
3060 val = aarch64_force_temporary (mode, temp1, val);
3061 val = gen_rtx_NEG (mode, val);
3062 }
3063
3064 if (constant == 0 || frame_related_p)
3065 {
3066 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3067 if (frame_related_p)
3068 {
3069 RTX_FRAME_RELATED_P (insn) = true;
3070 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3071 gen_rtx_SET (dest, plus_constant (Pmode, src,
3072 poly_offset)));
3073 }
3074 src = dest;
3075 if (constant == 0)
3076 return;
3077 }
3078 else
3079 {
3080 src = aarch64_force_temporary (mode, temp1, val);
3081 temp1 = temp2;
3082 temp2 = NULL_RTX;
3083 }
3084
3085 emit_move_imm = true;
3086 }
3087
3088 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3089 frame_related_p, emit_move_imm);
3090 }
3091
3092 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3093 than a poly_int64. */
3094
3095 void
3096 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3097 rtx offset_rtx, rtx temp1, rtx temp2)
3098 {
3099 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3100 temp1, temp2, false);
3101 }
3102
3103 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3104 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3105 if TEMP1 already contains abs (DELTA). */
3106
3107 static inline void
3108 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3109 {
3110 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3111 temp1, temp2, true, emit_move_imm);
3112 }
3113
3114 /* Subtract DELTA from the stack pointer, marking the instructions
3115 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3116 if nonnull. */
3117
3118 static inline void
3119 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3120 bool emit_move_imm = true)
3121 {
3122 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3123 temp1, temp2, frame_related_p, emit_move_imm);
3124 }
3125
3126 /* Set DEST to (vec_series BASE STEP). */
3127
3128 static void
3129 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3130 {
3131 machine_mode mode = GET_MODE (dest);
3132 scalar_mode inner = GET_MODE_INNER (mode);
3133
3134 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3135 if (!aarch64_sve_index_immediate_p (base))
3136 base = force_reg (inner, base);
3137 if (!aarch64_sve_index_immediate_p (step))
3138 step = force_reg (inner, step);
3139
3140 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3141 }
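
/* For example (illustrative), a VNx4SI series with BASE == 0 and STEP == 1
   becomes a single INDEX instruction, "index z0.s, #0, #1"; bases or steps
   outside [-16, 15] are first forced into scalar registers.  */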
3142
3143 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3144 integer of mode SRC_MODE. Return true on success. */
3145
3146 static bool
3147 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3148 rtx src)
3149 {
3150 /* If the constant is smaller than 128 bits, we can do the move
3151 using a vector of SRC_MODEs. */
3152 if (src_mode != TImode)
3153 {
3154 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3155 GET_MODE_SIZE (src_mode));
3156 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3157 emit_move_insn (gen_lowpart (dup_mode, dest),
3158 gen_const_vec_duplicate (dup_mode, src));
3159 return true;
3160 }
3161
3162 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3163 src = force_const_mem (src_mode, src);
3164 if (!src)
3165 return false;
3166
3167 /* Make sure that the address is legitimate. */
3168 if (!aarch64_sve_ld1r_operand_p (src))
3169 {
3170 rtx addr = force_reg (Pmode, XEXP (src, 0));
3171 src = replace_equiv_address (src, addr);
3172 }
3173
3174 machine_mode mode = GET_MODE (dest);
3175 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3176 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3177 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3178 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3179 emit_insn (gen_rtx_SET (dest, src));
3180 return true;
3181 }
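
/* Worked example (illustrative, little-endian): a VNx16QI constant that
   repeats the bytes { 1, 2, 3, 4 } is passed here as the SImode integer
   0x04030201 and is moved as a VNx4SI duplicate of that value; a full
   128-bit (TImode) pattern instead goes through the LD1RQ path above.  */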
3182
3183 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3184 isn't a simple duplicate or series. */
3185
3186 static void
3187 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3188 {
3189 machine_mode mode = GET_MODE (src);
3190 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3191 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3192 gcc_assert (npatterns > 1);
3193
3194 if (nelts_per_pattern == 1)
3195 {
3196 /* The constant is a repeating sequence of at least two elements,
3197 where the repeating elements occupy no more than 128 bits.
3198 Get an integer representation of the replicated value. */
3199 scalar_int_mode int_mode;
3200 if (BYTES_BIG_ENDIAN)
3201 /* For now, always use LD1RQ to load the value on big-endian
3202 targets, since the handling of smaller integers includes a
3203 subreg that is semantically an element reverse. */
3204 int_mode = TImode;
3205 else
3206 {
3207 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3208 gcc_assert (int_bits <= 128);
3209 int_mode = int_mode_for_size (int_bits, 0).require ();
3210 }
3211 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3212 if (int_value
3213 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3214 return;
3215 }
3216
3217 /* Expand each pattern individually. */
3218 rtx_vector_builder builder;
3219 auto_vec<rtx, 16> vectors (npatterns);
3220 for (unsigned int i = 0; i < npatterns; ++i)
3221 {
3222 builder.new_vector (mode, 1, nelts_per_pattern);
3223 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3224 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3225 vectors.quick_push (force_reg (mode, builder.build ()));
3226 }
3227
3228 /* Use permutes to interleave the separate vectors. */
3229 while (npatterns > 1)
3230 {
3231 npatterns /= 2;
3232 for (unsigned int i = 0; i < npatterns; ++i)
3233 {
3234 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3235 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3236 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3237 vectors[i] = tmp;
3238 }
3239 }
3240 gcc_assert (vectors[0] == dest);
3241 }
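
/* Worked example (illustrative): for a constant with four patterns
   { a, b, c, d, a, b, c, d, ... } the code above first builds the four
   duplicates { a, a, ... }, { b, b, ... }, { c, c, ... } and { d, d, ... },
   then zips them pairwise:
     zip1  { a, a, ... }, { c, c, ... }  ->  { a, c, a, c, ... }
     zip1  { b, b, ... }, { d, d, ... }  ->  { b, d, b, d, ... }
   and finally zips those two results into DEST:
     zip1  { a, c, ... }, { b, d, ... }  ->  { a, b, c, d, a, b, c, d, ... }  */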
3242
3243 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3244 is a pattern that can be used to set DEST to a replicated scalar
3245 element. */
3246
3247 void
3248 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3249 rtx (*gen_vec_duplicate) (rtx, rtx))
3250 {
3251 machine_mode mode = GET_MODE (dest);
3252
3253 /* Check on what type of symbol it is. */
3254 scalar_int_mode int_mode;
3255 if ((GET_CODE (imm) == SYMBOL_REF
3256 || GET_CODE (imm) == LABEL_REF
3257 || GET_CODE (imm) == CONST
3258 || GET_CODE (imm) == CONST_POLY_INT)
3259 && is_a <scalar_int_mode> (mode, &int_mode))
3260 {
3261 rtx mem;
3262 poly_int64 offset;
3263 HOST_WIDE_INT const_offset;
3264 enum aarch64_symbol_type sty;
3265
3266 /* If we have (const (plus symbol offset)), separate out the offset
3267 before we start classifying the symbol. */
3268 rtx base = strip_offset (imm, &offset);
3269
3270 /* We must always add an offset involving VL separately, rather than
3271 folding it into the relocation. */
3272 if (!offset.is_constant (&const_offset))
3273 {
3274 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3275 emit_insn (gen_rtx_SET (dest, imm));
3276 else
3277 {
3278 /* Do arithmetic on 32-bit values if the result is smaller
3279 than that. */
3280 if (partial_subreg_p (int_mode, SImode))
3281 {
3282 /* It is invalid to do symbol calculations in modes
3283 narrower than SImode. */
3284 gcc_assert (base == const0_rtx);
3285 dest = gen_lowpart (SImode, dest);
3286 int_mode = SImode;
3287 }
3288 if (base != const0_rtx)
3289 {
3290 base = aarch64_force_temporary (int_mode, dest, base);
3291 aarch64_add_offset (int_mode, dest, base, offset,
3292 NULL_RTX, NULL_RTX, false);
3293 }
3294 else
3295 aarch64_add_offset (int_mode, dest, base, offset,
3296 dest, NULL_RTX, false);
3297 }
3298 return;
3299 }
3300
3301 sty = aarch64_classify_symbol (base, const_offset);
3302 switch (sty)
3303 {
3304 case SYMBOL_FORCE_TO_MEM:
3305 if (const_offset != 0
3306 && targetm.cannot_force_const_mem (int_mode, imm))
3307 {
3308 gcc_assert (can_create_pseudo_p ());
3309 base = aarch64_force_temporary (int_mode, dest, base);
3310 aarch64_add_offset (int_mode, dest, base, const_offset,
3311 NULL_RTX, NULL_RTX, false);
3312 return;
3313 }
3314
3315 mem = force_const_mem (ptr_mode, imm);
3316 gcc_assert (mem);
3317
3318 /* If we aren't generating PC relative literals, then
3319 we need to expand the literal pool access carefully.
3320 This is something that needs to be done in a number
3321 of places, so could well live as a separate function. */
3322 if (!aarch64_pcrelative_literal_loads)
3323 {
3324 gcc_assert (can_create_pseudo_p ());
3325 base = gen_reg_rtx (ptr_mode);
3326 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3327 if (ptr_mode != Pmode)
3328 base = convert_memory_address (Pmode, base);
3329 mem = gen_rtx_MEM (ptr_mode, base);
3330 }
3331
3332 if (int_mode != ptr_mode)
3333 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3334
3335 emit_insn (gen_rtx_SET (dest, mem));
3336
3337 return;
3338
3339 case SYMBOL_SMALL_TLSGD:
3340 case SYMBOL_SMALL_TLSDESC:
3341 case SYMBOL_SMALL_TLSIE:
3342 case SYMBOL_SMALL_GOT_28K:
3343 case SYMBOL_SMALL_GOT_4G:
3344 case SYMBOL_TINY_GOT:
3345 case SYMBOL_TINY_TLSIE:
3346 if (const_offset != 0)
3347 {
3348 gcc_assert (can_create_pseudo_p ());
3349 base = aarch64_force_temporary (int_mode, dest, base);
3350 aarch64_add_offset (int_mode, dest, base, const_offset,
3351 NULL_RTX, NULL_RTX, false);
3352 return;
3353 }
3354 /* FALLTHRU */
3355
3356 case SYMBOL_SMALL_ABSOLUTE:
3357 case SYMBOL_TINY_ABSOLUTE:
3358 case SYMBOL_TLSLE12:
3359 case SYMBOL_TLSLE24:
3360 case SYMBOL_TLSLE32:
3361 case SYMBOL_TLSLE48:
3362 aarch64_load_symref_appropriately (dest, imm, sty);
3363 return;
3364
3365 default:
3366 gcc_unreachable ();
3367 }
3368 }
3369
3370 if (!CONST_INT_P (imm))
3371 {
3372 rtx base, step, value;
3373 if (GET_CODE (imm) == HIGH
3374 || aarch64_simd_valid_immediate (imm, NULL))
3375 emit_insn (gen_rtx_SET (dest, imm));
3376 else if (const_vec_series_p (imm, &base, &step))
3377 aarch64_expand_vec_series (dest, base, step);
3378 else if (const_vec_duplicate_p (imm, &value))
3379 {
3380 /* If the constant is out of range of an SVE vector move,
3381 load it from memory if we can, otherwise move it into
3382 a register and use a DUP. */
3383 scalar_mode inner_mode = GET_MODE_INNER (mode);
3384 rtx op = force_const_mem (inner_mode, value);
3385 if (!op)
3386 op = force_reg (inner_mode, value);
3387 else if (!aarch64_sve_ld1r_operand_p (op))
3388 {
3389 rtx addr = force_reg (Pmode, XEXP (op, 0));
3390 op = replace_equiv_address (op, addr);
3391 }
3392 emit_insn (gen_vec_duplicate (dest, op));
3393 }
3394 else if (GET_CODE (imm) == CONST_VECTOR
3395 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3396 aarch64_expand_sve_const_vector (dest, imm);
3397 else
3398 {
3399 rtx mem = force_const_mem (mode, imm);
3400 gcc_assert (mem);
3401 emit_move_insn (dest, mem);
3402 }
3403
3404 return;
3405 }
3406
3407 aarch64_internal_mov_immediate (dest, imm, true,
3408 as_a <scalar_int_mode> (mode));
3409 }
3410
3411 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3412 that is known to contain PTRUE. */
3413
3414 void
3415 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3416 {
3417 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3418 gen_rtvec (2, pred, src),
3419 UNSPEC_MERGE_PTRUE)));
3420 }
3421
3422 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3423 operand is in memory. In this case we need to use the predicated LD1
3424 and ST1 instead of LDR and STR, both for correctness on big-endian
3425 targets and because LD1 and ST1 support a wider range of addressing modes.
3426 PRED_MODE is the mode of the predicate.
3427
3428 See the comment at the head of aarch64-sve.md for details about the
3429 big-endian handling. */
3430
3431 void
3432 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3433 {
3434 machine_mode mode = GET_MODE (dest);
3435 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3436 if (!register_operand (src, mode)
3437 && !register_operand (dest, mode))
3438 {
3439 rtx tmp = gen_reg_rtx (mode);
3440 if (MEM_P (src))
3441 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3442 else
3443 emit_move_insn (tmp, src);
3444 src = tmp;
3445 }
3446 aarch64_emit_sve_pred_move (dest, ptrue, src);
3447 }
3448
3449 /* Called only on big-endian targets. See whether an SVE vector move
3450 from SRC to DEST is effectively a REV[BHW] instruction, because at
3451 least one operand is a subreg of an SVE vector that has wider or
3452 narrower elements. Return true and emit the instruction if so.
3453
3454 For example:
3455
3456 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3457
3458 represents a VIEW_CONVERT between the following vectors, viewed
3459 in memory order:
3460
3461 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3462 R1: { [0], [1], [2], [3], ... }
3463
3464 The high part of lane X in R2 should therefore correspond to lane X*2
3465 of R1, but the register representations are:
3466
3467 msb lsb
3468 R2: ...... [1].high [1].low [0].high [0].low
3469 R1: ...... [3] [2] [1] [0]
3470
3471 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3472 We therefore need a reverse operation to swap the high and low values
3473 around.
3474
3475 This is purely an optimization. Without it we would spill the
3476 subreg operand to the stack in one mode and reload it in the
3477 other mode, which has the same effect as the REV. */
3478
3479 bool
3480 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3481 {
3482 gcc_assert (BYTES_BIG_ENDIAN);
3483 if (GET_CODE (dest) == SUBREG)
3484 dest = SUBREG_REG (dest);
3485 if (GET_CODE (src) == SUBREG)
3486 src = SUBREG_REG (src);
3487
3488 /* The optimization handles two single SVE REGs with different element
3489 sizes. */
3490 if (!REG_P (dest)
3491 || !REG_P (src)
3492 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3493 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3494 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3495 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3496 return false;
3497
3498 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3499 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3500 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3501 UNSPEC_REV_SUBREG);
3502 emit_insn (gen_rtx_SET (dest, unspec));
3503 return true;
3504 }
3505
3506 /* Return a copy of X with mode MODE, without changing its other
3507 attributes. Unlike gen_lowpart, this doesn't care whether the
3508 mode change is valid. */
3509
3510 static rtx
3511 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3512 {
3513 if (GET_MODE (x) == mode)
3514 return x;
3515
3516 x = shallow_copy_rtx (x);
3517 set_mode_and_regno (x, mode, REGNO (x));
3518 return x;
3519 }
3520
3521 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3522 operands. */
3523
3524 void
3525 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3526 {
3527 /* Decide which REV operation we need. The mode with narrower elements
3528 determines the mode of the operands and the mode with the wider
3529 elements determines the reverse width. */
3530 machine_mode mode_with_wider_elts = GET_MODE (dest);
3531 machine_mode mode_with_narrower_elts = GET_MODE (src);
3532 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3533 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3534 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3535
3536 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3537 unsigned int unspec;
3538 if (wider_bytes == 8)
3539 unspec = UNSPEC_REV64;
3540 else if (wider_bytes == 4)
3541 unspec = UNSPEC_REV32;
3542 else if (wider_bytes == 2)
3543 unspec = UNSPEC_REV16;
3544 else
3545 gcc_unreachable ();
3546 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3547
3548 /* Emit:
3549
3550 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3551 UNSPEC_MERGE_PTRUE))
3552
3553 with the appropriate modes. */
3554 ptrue = gen_lowpart (pred_mode, ptrue);
3555 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3556 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3557 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3558 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3559 UNSPEC_MERGE_PTRUE);
3560 emit_insn (gen_rtx_SET (dest, src));
3561 }
3562
3563 static bool
3564 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3565 tree exp ATTRIBUTE_UNUSED)
3566 {
3567 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3568 return false;
3569
3570 return true;
3571 }
3572
3573 /* Implement TARGET_PASS_BY_REFERENCE. */
3574
3575 static bool
3576 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3577 machine_mode mode,
3578 const_tree type,
3579 bool named ATTRIBUTE_UNUSED)
3580 {
3581 HOST_WIDE_INT size;
3582 machine_mode dummymode;
3583 int nregs;
3584
3585 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3586 if (mode == BLKmode && type)
3587 size = int_size_in_bytes (type);
3588 else
3589 /* No frontends can create types with variable-sized modes, so we
3590 shouldn't be asked to pass or return them. */
3591 size = GET_MODE_SIZE (mode).to_constant ();
3592
3593 /* Aggregates are passed by reference based on their size. */
3594 if (type && AGGREGATE_TYPE_P (type))
3595 {
3596 size = int_size_in_bytes (type);
3597 }
3598
3599 /* Variable-sized arguments are always passed by reference. */
3600 if (size < 0)
3601 return true;
3602
3603 /* Can this be a candidate to be passed in fp/simd register(s)? */
3604 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3605 &dummymode, &nregs,
3606 NULL))
3607 return false;
3608
3609 /* Arguments which are variable sized or larger than 2 registers are
3610 passed by reference unless they are a homogeneous floating-point
3611 aggregate. */
3612 return size > 2 * UNITS_PER_WORD;
3613 }
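
/* Worked example (illustrative, not from the original source): with
   UNITS_PER_WORD == 8, a plain struct { long a, b; } (16 bytes) is passed
   by value (in two general registers when available), while
   struct { long a, b, c; } (24 bytes) exceeds 2 * UNITS_PER_WORD and is
   passed by reference.  A struct { double a, b, c, d; } is a homogeneous
   floating-point aggregate, so it is passed by value in SIMD/FP registers
   despite being 32 bytes.  */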
3614
3615 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3616 static bool
3617 aarch64_return_in_msb (const_tree valtype)
3618 {
3619 machine_mode dummy_mode;
3620 int dummy_int;
3621
3622 /* Never happens in little-endian mode. */
3623 if (!BYTES_BIG_ENDIAN)
3624 return false;
3625
3626 /* Only composite types of 16 bytes or fewer can potentially be
3627 returned in registers. */
3628 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3629 || int_size_in_bytes (valtype) <= 0
3630 || int_size_in_bytes (valtype) > 16)
3631 return false;
3632
3633 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3634 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3635 is always passed/returned in the least significant bits of fp/simd
3636 register(s). */
3637 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3638 &dummy_mode, &dummy_int, NULL))
3639 return false;
3640
3641 return true;
3642 }
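
/* Illustrative example (not part of the original source): on a big-endian
   target a struct { char c; short s; } (4 bytes after padding) is a
   composite of at most 16 bytes and not an HFA/HVA, so it is returned in
   the most significant bits of the return register; on little-endian
   targets this hook always returns false and the value sits in the least
   significant bits.  */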
3643
3644 /* Implement TARGET_FUNCTION_VALUE.
3645 Define how to find the value returned by a function. */
3646
3647 static rtx
3648 aarch64_function_value (const_tree type, const_tree func,
3649 bool outgoing ATTRIBUTE_UNUSED)
3650 {
3651 machine_mode mode;
3652 int unsignedp;
3653 int count;
3654 machine_mode ag_mode;
3655
3656 mode = TYPE_MODE (type);
3657 if (INTEGRAL_TYPE_P (type))
3658 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3659
3660 if (aarch64_return_in_msb (type))
3661 {
3662 HOST_WIDE_INT size = int_size_in_bytes (type);
3663
3664 if (size % UNITS_PER_WORD != 0)
3665 {
3666 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3667 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3668 }
3669 }
3670
3671 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3672 &ag_mode, &count, NULL))
3673 {
3674 if (!aarch64_composite_type_p (type, mode))
3675 {
3676 gcc_assert (count == 1 && mode == ag_mode);
3677 return gen_rtx_REG (mode, V0_REGNUM);
3678 }
3679 else
3680 {
3681 int i;
3682 rtx par;
3683
3684 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3685 for (i = 0; i < count; i++)
3686 {
3687 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3688 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3689 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3690 XVECEXP (par, 0, i) = tmp;
3691 }
3692 return par;
3693 }
3694 }
3695 else
3696 return gen_rtx_REG (mode, R0_REGNUM);
3697 }
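
/* Illustrative example (not part of the original source): for a return
   type of struct { float x, y, z; }, which is a homogeneous floating-point
   aggregate, aarch64_vfp_is_call_or_return_candidate reports AG_MODE ==
   SFmode and COUNT == 3, so the code above builds a PARALLEL describing
   V0..V2 at byte offsets 0, 4 and 8.  A scalar integer return value simply
   comes back in R0.  */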
3698
3699 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3700 Return true if REGNO is the number of a hard register in which the value
3701 of a called function may come back. */
3702
3703 static bool
3704 aarch64_function_value_regno_p (const unsigned int regno)
3705 {
3706 /* A maximum of 16 bytes can be returned in the general registers. Examples
3707 of 16-byte return values are 128-bit integers and small 16-byte
3708 structures (excluding homogeneous floating-point aggregates). */
3709 if (regno == R0_REGNUM || regno == R1_REGNUM)
3710 return true;
3711
3712 /* Up to four fp/simd registers can return a function value, e.g. a
3713 homogeneous floating-point aggregate having four members. */
3714 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3715 return TARGET_FLOAT;
3716
3717 return false;
3718 }
3719
3720 /* Implement TARGET_RETURN_IN_MEMORY.
3721
3722 If the type T of the result of a function is such that
3723 void func (T arg)
3724 would require that arg be passed as a value in a register (or set of
3725 registers) according to the parameter passing rules, then the result
3726 is returned in the same registers as would be used for such an
3727 argument. */
3728
3729 static bool
3730 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3731 {
3732 HOST_WIDE_INT size;
3733 machine_mode ag_mode;
3734 int count;
3735
3736 if (!AGGREGATE_TYPE_P (type)
3737 && TREE_CODE (type) != COMPLEX_TYPE
3738 && TREE_CODE (type) != VECTOR_TYPE)
3739 /* Simple scalar types are always returned in registers. */
3740 return false;
3741
3742 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3743 type,
3744 &ag_mode,
3745 &count,
3746 NULL))
3747 return false;
3748
3749 /* Types larger than 2 registers are returned in memory. */
3750 size = int_size_in_bytes (type);
3751 return (size < 0 || size > 2 * UNITS_PER_WORD);
3752 }
3753
3754 static bool
3755 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3756 const_tree type, int *nregs)
3757 {
3758 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3759 return aarch64_vfp_is_call_or_return_candidate (mode,
3760 type,
3761 &pcum->aapcs_vfp_rmode,
3762 nregs,
3763 NULL);
3764 }
3765
3766 /* Given MODE and TYPE of a function argument, return the alignment in
3767 bits. The idea is to suppress any stronger alignment requested by
3768 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3769 This is a helper function for local use only. */
3770
3771 static unsigned int
3772 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3773 {
3774 if (!type)
3775 return GET_MODE_ALIGNMENT (mode);
3776
3777 if (integer_zerop (TYPE_SIZE (type)))
3778 return 0;
3779
3780 gcc_assert (TYPE_MODE (type) == mode);
3781
3782 if (!AGGREGATE_TYPE_P (type))
3783 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3784
3785 if (TREE_CODE (type) == ARRAY_TYPE)
3786 return TYPE_ALIGN (TREE_TYPE (type));
3787
3788 unsigned int alignment = 0;
3789 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3790 if (TREE_CODE (field) == FIELD_DECL)
3791 alignment = std::max (alignment, DECL_ALIGN (field));
3792
3793 return alignment;
3794 }
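
/* Illustrative example (not part of the original source): given

     struct s { int x; } __attribute__ ((aligned (16)));

   the walk over FIELD_DECLs above sees only the int member, whose own
   alignment is 32 bits, so an argument of type struct s should get 32-bit
   alignment here; the 16-byte over-alignment requested on the aggregate
   itself is not propagated.  A member that is itself declared with
   aligned (16) would instead raise the result to 128 bits.  */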
3795
3796 /* Layout a function argument according to the AAPCS64 rules. The rule
3797 numbers refer to the rule numbers in the AAPCS64. */
3798
3799 static void
3800 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3801 const_tree type,
3802 bool named ATTRIBUTE_UNUSED)
3803 {
3804 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3805 int ncrn, nvrn, nregs;
3806 bool allocate_ncrn, allocate_nvrn;
3807 HOST_WIDE_INT size;
3808
3809 /* We need to do this once per argument. */
3810 if (pcum->aapcs_arg_processed)
3811 return;
3812
3813 pcum->aapcs_arg_processed = true;
3814
3815 /* Size in bytes, rounded up to the next multiple of 8 bytes. */
3816 if (type)
3817 size = int_size_in_bytes (type);
3818 else
3819 /* No frontends can create types with variable-sized modes, so we
3820 shouldn't be asked to pass or return them. */
3821 size = GET_MODE_SIZE (mode).to_constant ();
3822 size = ROUND_UP (size, UNITS_PER_WORD);
3823
3824 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3825 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3826 mode,
3827 type,
3828 &nregs);
3829
3830 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3831 The following code thus handles passing by SIMD/FP registers first. */
3832
3833 nvrn = pcum->aapcs_nvrn;
3834
3835 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3836 and homogeneous short-vector aggregates (HVA). */
3837 if (allocate_nvrn)
3838 {
3839 if (!TARGET_FLOAT)
3840 aarch64_err_no_fpadvsimd (mode);
3841
3842 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3843 {
3844 pcum->aapcs_nextnvrn = nvrn + nregs;
3845 if (!aarch64_composite_type_p (type, mode))
3846 {
3847 gcc_assert (nregs == 1);
3848 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3849 }
3850 else
3851 {
3852 rtx par;
3853 int i;
3854 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3855 for (i = 0; i < nregs; i++)
3856 {
3857 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3858 V0_REGNUM + nvrn + i);
3859 rtx offset = gen_int_mode
3860 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3861 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3862 XVECEXP (par, 0, i) = tmp;
3863 }
3864 pcum->aapcs_reg = par;
3865 }
3866 return;
3867 }
3868 else
3869 {
3870 /* C.3 NSRN is set to 8. */
3871 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3872 goto on_stack;
3873 }
3874 }
3875
3876 ncrn = pcum->aapcs_ncrn;
3877 nregs = size / UNITS_PER_WORD;
3878
3879 /* C6 - C9, though the sign and zero extension semantics are
3880 handled elsewhere. This is the case where the argument fits
3881 entirely in general registers. */
3882 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3883 {
3884
3885 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3886
3887 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3888 rounded up to the next even number. */
3889 if (nregs == 2
3890 && ncrn % 2
3891 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3892 comparison is there because for > 16 * BITS_PER_UNIT
3893 alignment nregs should be > 2 and therefore it should be
3894 passed by reference rather than value. */
3895 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3896 {
3897 ++ncrn;
3898 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3899 }
3900
3901 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3902 A reg is still generated for it, but the caller should be smart
3903 enough not to use it. */
3904 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3905 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3906 else
3907 {
3908 rtx par;
3909 int i;
3910
3911 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3912 for (i = 0; i < nregs; i++)
3913 {
3914 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3915 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3916 GEN_INT (i * UNITS_PER_WORD));
3917 XVECEXP (par, 0, i) = tmp;
3918 }
3919 pcum->aapcs_reg = par;
3920 }
3921
3922 pcum->aapcs_nextncrn = ncrn + nregs;
3923 return;
3924 }
3925
3926 /* C.11 */
3927 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3928
3929 /* The argument is passed on stack; record the needed number of words for
3930 this argument and align the total size if necessary. */
3931 on_stack:
3932 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3933
3934 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3935 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3936 16 / UNITS_PER_WORD);
3937 return;
3938 }
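
/* Worked example of rule C.8 (illustrative, not from the original source):
   for a call such as f (int a, __int128 b), a takes W0, leaving NCRN == 1.
   b needs two registers and has 16-byte alignment, so NCRN is rounded up
   to 2 and b is passed in X2:X3, with X1 left unused.  Without the C.8
   adjustment b would straddle an odd/even register pair.  */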
3939
3940 /* Implement TARGET_FUNCTION_ARG. */
3941
3942 static rtx
3943 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3944 const_tree type, bool named)
3945 {
3946 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3947 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3948
3949 if (mode == VOIDmode)
3950 return NULL_RTX;
3951
3952 aarch64_layout_arg (pcum_v, mode, type, named);
3953 return pcum->aapcs_reg;
3954 }
3955
3956 void
3957 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3958 const_tree fntype ATTRIBUTE_UNUSED,
3959 rtx libname ATTRIBUTE_UNUSED,
3960 const_tree fndecl ATTRIBUTE_UNUSED,
3961 unsigned n_named ATTRIBUTE_UNUSED)
3962 {
3963 pcum->aapcs_ncrn = 0;
3964 pcum->aapcs_nvrn = 0;
3965 pcum->aapcs_nextncrn = 0;
3966 pcum->aapcs_nextnvrn = 0;
3967 pcum->pcs_variant = ARM_PCS_AAPCS64;
3968 pcum->aapcs_reg = NULL_RTX;
3969 pcum->aapcs_arg_processed = false;
3970 pcum->aapcs_stack_words = 0;
3971 pcum->aapcs_stack_size = 0;
3972
3973 if (!TARGET_FLOAT
3974 && fndecl && TREE_PUBLIC (fndecl)
3975 && fntype && fntype != error_mark_node)
3976 {
3977 const_tree type = TREE_TYPE (fntype);
3978 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3979 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3980 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3981 &mode, &nregs, NULL))
3982 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3983 }
3984 return;
3985 }
3986
3987 static void
3988 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3989 machine_mode mode,
3990 const_tree type,
3991 bool named)
3992 {
3993 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3994 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3995 {
3996 aarch64_layout_arg (pcum_v, mode, type, named);
3997 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3998 != (pcum->aapcs_stack_words != 0));
3999 pcum->aapcs_arg_processed = false;
4000 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4001 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4002 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4003 pcum->aapcs_stack_words = 0;
4004 pcum->aapcs_reg = NULL_RTX;
4005 }
4006 }
4007
4008 bool
4009 aarch64_function_arg_regno_p (unsigned regno)
4010 {
4011 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4012 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4013 }
4014
4015 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4016 PARM_BOUNDARY bits of alignment, but will be given anything up
4017 to STACK_BOUNDARY bits if the type requires it. This makes sure
4018 that both before and after the layout of each argument, the Next
4019 Stacked Argument Address (NSAA) will have a minimum alignment of
4020 8 bytes. */
4021
4022 static unsigned int
4023 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4024 {
4025 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
4026 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4027 }
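
/* Illustrative arithmetic (not part of the original source): with the
   AArch64 values PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128, a char
   argument (8-bit alignment) gets MIN (MAX (8, 64), 128) == 64 bits,
   while an over-aligned type reporting 256-bit alignment is clamped to
   MIN (MAX (256, 64), 128) == 128 bits.  */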
4028
4029 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4030
4031 static fixed_size_mode
4032 aarch64_get_reg_raw_mode (int regno)
4033 {
4034 if (TARGET_SVE && FP_REGNUM_P (regno))
4035 /* Don't use the SVE part of the register for __builtin_apply and
4036 __builtin_return. The SVE registers aren't used by the normal PCS,
4037 so using them there would be a waste of time. The PCS extensions
4038 for SVE types are fundamentally incompatible with the
4039 __builtin_return/__builtin_apply interface. */
4040 return as_a <fixed_size_mode> (V16QImode);
4041 return default_get_reg_raw_mode (regno);
4042 }
4043
4044 /* Implement TARGET_FUNCTION_ARG_PADDING.
4045
4046 Small aggregate types are placed in the lowest memory address.
4047
4048 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4049
4050 static pad_direction
4051 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4052 {
4053 /* On little-endian targets, the least significant byte of every stack
4054 argument is passed at the lowest byte address of the stack slot. */
4055 if (!BYTES_BIG_ENDIAN)
4056 return PAD_UPWARD;
4057
4058 /* Otherwise, integral, floating-point and pointer types are padded downward:
4059 the least significant byte of a stack argument is passed at the highest
4060 byte address of the stack slot. */
4061 if (type
4062 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4063 || POINTER_TYPE_P (type))
4064 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4065 return PAD_DOWNWARD;
4066
4067 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4068 return PAD_UPWARD;
4069 }
4070
4071 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4072
4073 It specifies padding for the last (possibly the only)
4074 element of a block move between registers and memory. Assuming
4075 the block is in memory, padding upward means that the last
4076 element is padded after its most significant byte, while with
4077 downward padding the last element is padded on its least
4078 significant byte side.
4079
4080 Small aggregates and small complex types are always padded
4081 upwards.
4082
4083 We don't need to worry about homogeneous floating-point or
4084 short-vector aggregates; their move is not affected by the
4085 padding direction determined here. Regardless of endianness,
4086 each element of such an aggregate is put in the least
4087 significant bits of a fp/simd register.
4088
4089 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4090 register has useful data, and return the opposite if the most
4091 significant byte does. */
4092
4093 bool
4094 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4095 bool first ATTRIBUTE_UNUSED)
4096 {
4097
4098 /* Small composite types are always padded upward. */
4099 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4100 {
4101 HOST_WIDE_INT size;
4102 if (type)
4103 size = int_size_in_bytes (type);
4104 else
4105 /* No frontends can create types with variable-sized modes, so we
4106 shouldn't be asked to pass or return them. */
4107 size = GET_MODE_SIZE (mode).to_constant ();
4108 if (size < 2 * UNITS_PER_WORD)
4109 return true;
4110 }
4111
4112 /* Otherwise, use the default padding. */
4113 return !BYTES_BIG_ENDIAN;
4114 }
4115
4116 static scalar_int_mode
4117 aarch64_libgcc_cmp_return_mode (void)
4118 {
4119 return SImode;
4120 }
4121
4122 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4123
4124 /* We use the 12-bit shifted immediate arithmetic instructions, so values
4125 must be a multiple of (1 << 12), i.e. 4096. */
4126 #define ARITH_FACTOR 4096
4127
4128 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4129 #error Cannot use simple address calculation for stack probing
4130 #endif
4131
4132 /* The pair of scratch registers used for stack probing. */
4133 #define PROBE_STACK_FIRST_REG R9_REGNUM
4134 #define PROBE_STACK_SECOND_REG R10_REGNUM
4135
4136 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4137 inclusive. These are offsets from the current stack pointer. */
4138
4139 static void
4140 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4141 {
4142 HOST_WIDE_INT size;
4143 if (!poly_size.is_constant (&size))
4144 {
4145 sorry ("stack probes for SVE frames");
4146 return;
4147 }
4148
4149 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4150
4151 /* See the same assertion on PROBE_INTERVAL above. */
4152 gcc_assert ((first % ARITH_FACTOR) == 0);
4153
4154 /* See if we have a constant small number of probes to generate. If so,
4155 that's the easy case. */
4156 if (size <= PROBE_INTERVAL)
4157 {
4158 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4159
4160 emit_set_insn (reg1,
4161 plus_constant (Pmode,
4162 stack_pointer_rtx, -(first + base)));
4163 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4164 }
4165
4166 /* The run-time loop is made up of 8 insns in the generic case while the
4167 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4168 else if (size <= 4 * PROBE_INTERVAL)
4169 {
4170 HOST_WIDE_INT i, rem;
4171
4172 emit_set_insn (reg1,
4173 plus_constant (Pmode,
4174 stack_pointer_rtx,
4175 -(first + PROBE_INTERVAL)));
4176 emit_stack_probe (reg1);
4177
4178 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4179 it exceeds SIZE. If only two probes are needed, this will not
4180 generate any code. Then probe at FIRST + SIZE. */
4181 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4182 {
4183 emit_set_insn (reg1,
4184 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4185 emit_stack_probe (reg1);
4186 }
4187
4188 rem = size - (i - PROBE_INTERVAL);
4189 if (rem > 256)
4190 {
4191 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4192
4193 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4194 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4195 }
4196 else
4197 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4198 }
4199
4200 /* Otherwise, do the same as above, but in a loop. Note that we must be
4201 extra careful with variables wrapping around because we might be at
4202 the very top (or the very bottom) of the address space and we have
4203 to be able to handle this case properly; in particular, we use an
4204 equality test for the loop condition. */
4205 else
4206 {
4207 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4208
4209 /* Step 1: round SIZE to the previous multiple of the interval. */
4210
4211 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4212
4213
4214 /* Step 2: compute initial and final value of the loop counter. */
4215
4216 /* TEST_ADDR = SP + FIRST. */
4217 emit_set_insn (reg1,
4218 plus_constant (Pmode, stack_pointer_rtx, -first));
4219
4220 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4221 HOST_WIDE_INT adjustment = - (first + rounded_size);
4222 if (! aarch64_uimm12_shift (adjustment))
4223 {
4224 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4225 true, Pmode);
4226 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4227 }
4228 else
4229 emit_set_insn (reg2,
4230 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4231
4232 /* Step 3: the loop
4233
4234 do
4235 {
4236 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4237 probe at TEST_ADDR
4238 }
4239 while (TEST_ADDR != LAST_ADDR)
4240
4241 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4242 until it is equal to ROUNDED_SIZE. */
4243
4244 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4245
4246
4247 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4248 that SIZE is equal to ROUNDED_SIZE. */
4249
4250 if (size != rounded_size)
4251 {
4252 HOST_WIDE_INT rem = size - rounded_size;
4253
4254 if (rem > 256)
4255 {
4256 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4257
4258 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4259 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4260 }
4261 else
4262 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4263 }
4264 }
4265
4266 /* Make sure nothing is scheduled before we are done. */
4267 emit_insn (gen_blockage ());
4268 }
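
/* Worked example (illustrative only): assuming the default 4096-byte
   PROBE_INTERVAL, a request to probe FIRST..FIRST+10000 takes the
   "size <= 4 * PROBE_INTERVAL" path above: probes are emitted at
   FIRST + 4096 and FIRST + 8192, the 1808-byte residual is larger than
   256, so the scratch register (x9 here) is dropped by a further 4096
   and the final probe lands exactly at FIRST + 10000.  */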
4269
4270 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4271 absolute addresses. */
4272
4273 const char *
4274 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4275 {
4276 static int labelno = 0;
4277 char loop_lab[32];
4278 rtx xops[2];
4279
4280 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4281
4282 /* Loop. */
4283 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4284
4285 HOST_WIDE_INT stack_clash_probe_interval
4286 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4287
4288 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4289 xops[0] = reg1;
4290 HOST_WIDE_INT interval;
4291 if (flag_stack_clash_protection)
4292 interval = stack_clash_probe_interval;
4293 else
4294 interval = PROBE_INTERVAL;
4295
4296 gcc_assert (aarch64_uimm12_shift (interval));
4297 xops[1] = GEN_INT (interval);
4298
4299 output_asm_insn ("sub\t%0, %0, %1", xops);
4300
4301 /* If doing stack clash protection then we probe up by the ABI specified
4302 amount. We do this because we're dropping full pages at a time in the
4303 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4304 if (flag_stack_clash_protection)
4305 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4306 else
4307 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4308
4309 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4310 by this amount for each iteration. */
4311 output_asm_insn ("str\txzr, [%0, %1]", xops);
4312
4313 /* Test if TEST_ADDR == LAST_ADDR. */
4314 xops[1] = reg2;
4315 output_asm_insn ("cmp\t%0, %1", xops);
4316
4317 /* Branch. */
4318 fputs ("\tb.ne\t", asm_out_file);
4319 assemble_name_raw (asm_out_file, loop_lab);
4320 fputc ('\n', asm_out_file);
4321
4322 return "";
4323 }
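
/* For reference (illustrative, not a verbatim quote of compiler output):
   without stack-clash protection and with the default 4096-byte interval,
   the loop printed above comes out roughly as

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   where x9 and x10 are the probe scratch registers defined earlier.  */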
4324
4325 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4326 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4327 of GUARD_SIZE. When a probe is emitted it is done at most
4328 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4329 at most MIN_PROBE_THRESHOLD. By the end of this function
4330 BASE = BASE - ADJUSTMENT. */
4331
4332 const char *
4333 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4334 rtx min_probe_threshold, rtx guard_size)
4335 {
4336 /* This function is not allowed to use any instruction generation function
4337 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4338 so instead emit the code you want using output_asm_insn. */
4339 gcc_assert (flag_stack_clash_protection);
4340 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4341 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4342
4343 /* The minimum required allocation before the residual requires probing. */
4344 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4345
4346 /* Clamp the value down to the nearest value that can be used with a cmp. */
4347 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4348 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4349
4350 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4351 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4352
4353 static int labelno = 0;
4354 char loop_start_lab[32];
4355 char loop_end_lab[32];
4356 rtx xops[2];
4357
4358 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4359 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4360
4361 /* Emit loop start label. */
4362 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4363
4364 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4365 xops[0] = adjustment;
4366 xops[1] = probe_offset_value_rtx;
4367 output_asm_insn ("cmp\t%0, %1", xops);
4368
4369 /* Branch to end if not enough adjustment to probe. */
4370 fputs ("\tb.lt\t", asm_out_file);
4371 assemble_name_raw (asm_out_file, loop_end_lab);
4372 fputc ('\n', asm_out_file);
4373
4374 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4375 xops[0] = base;
4376 xops[1] = probe_offset_value_rtx;
4377 output_asm_insn ("sub\t%0, %0, %1", xops);
4378
4379 /* Probe at BASE. */
4380 xops[1] = const0_rtx;
4381 output_asm_insn ("str\txzr, [%0, %1]", xops);
4382
4383 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4384 xops[0] = adjustment;
4385 xops[1] = probe_offset_value_rtx;
4386 output_asm_insn ("sub\t%0, %0, %1", xops);
4387
4388 /* Branch to start if still more bytes to allocate. */
4389 fputs ("\tb\t", asm_out_file);
4390 assemble_name_raw (asm_out_file, loop_start_lab);
4391 fputc ('\n', asm_out_file);
4392
4393 /* No more probes needed; this is where we leave the loop. */
4394 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4395
4396 /* BASE = BASE - ADJUSTMENT. */
4397 xops[0] = base;
4398 xops[1] = adjustment;
4399 output_asm_insn ("sub\t%0, %0, %1", xops);
4400 return "";
4401 }
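
/* Schematic of the sequence printed above (illustrative only, with xN and
   xM standing for whatever registers hold BASE and ADJUSTMENT, and IMM for
   the clamped residual probe guard):

	.SVLPSPL0:
	cmp	xM, IMM
	b.lt	.SVLPEND0
	sub	xN, xN, IMM
	str	xzr, [xN, 0]
	sub	xM, xM, IMM
	b	.SVLPSPL0
	.SVLPEND0:
	sub	xN, xN, xM
   */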
4402
4403 /* Determine whether a frame chain needs to be generated. */
4404 static bool
4405 aarch64_needs_frame_chain (void)
4406 {
4407 /* Force a frame chain for EH returns so the return address is at FP+8. */
4408 if (frame_pointer_needed || crtl->calls_eh_return)
4409 return true;
4410
4411 /* A leaf function cannot have calls or write LR. */
4412 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4413
4414 /* Don't use a frame chain in leaf functions if leaf frame pointers
4415 are disabled. */
4416 if (flag_omit_leaf_frame_pointer && is_leaf)
4417 return false;
4418
4419 return aarch64_use_frame_pointer;
4420 }
4421
4422 /* Mark the registers that need to be saved by the callee and calculate
4423 the size of the callee-saved registers area and frame record (both FP
4424 and LR may be omitted). */
4425 static void
4426 aarch64_layout_frame (void)
4427 {
4428 HOST_WIDE_INT offset = 0;
4429 int regno, last_fp_reg = INVALID_REGNUM;
4430 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4431
4432 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4433
4434 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4435 the mid-end is doing. */
4436 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4437
4438 #define SLOT_NOT_REQUIRED (-2)
4439 #define SLOT_REQUIRED (-1)
4440
4441 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4442 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4443
4444 /* If this is a non-leaf simd function with calls we assume that
4445 at least one of those calls is to a non-simd function and thus
4446 we must save V8 to V23 in the prologue. */
4447
4448 if (simd_function && !crtl->is_leaf)
4449 {
4450 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4451 if (FP_SIMD_SAVED_REGNUM_P (regno))
4452 df_set_regs_ever_live (regno, true);
4453 }
4454
4455 /* First mark all the registers that really need to be saved... */
4456 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4457 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4458
4459 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4460 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4461
4462 /* ... that includes the eh data registers (if needed)... */
4463 if (crtl->calls_eh_return)
4464 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4465 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4466 = SLOT_REQUIRED;
4467
4468 /* ... and any callee saved register that dataflow says is live. */
4469 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4470 if (df_regs_ever_live_p (regno)
4471 && (regno == R30_REGNUM
4472 || !call_used_regs[regno]))
4473 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4474
4475 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4476 if (df_regs_ever_live_p (regno)
4477 && (!call_used_regs[regno]
4478 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4479 {
4480 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4481 last_fp_reg = regno;
4482 }
4483
4484 if (cfun->machine->frame.emit_frame_chain)
4485 {
4486 /* FP and LR are placed in the linkage record. */
4487 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4488 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4489 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4490 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4491 offset = 2 * UNITS_PER_WORD;
4492 }
4493
4494 /* With stack-clash, LR must be saved in non-leaf functions. */
4495 gcc_assert (crtl->is_leaf
4496 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4497 != SLOT_NOT_REQUIRED));
4498
4499 /* Now assign stack slots for them. */
4500 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4501 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4502 {
4503 cfun->machine->frame.reg_offset[regno] = offset;
4504 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4505 cfun->machine->frame.wb_candidate1 = regno;
4506 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4507 cfun->machine->frame.wb_candidate2 = regno;
4508 offset += UNITS_PER_WORD;
4509 }
4510
4511 HOST_WIDE_INT max_int_offset = offset;
4512 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4513 bool has_align_gap = offset != max_int_offset;
4514
4515 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4516 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4517 {
4518 /* If there is an alignment gap between integer and fp callee-saves,
4519 allocate the last fp register to it if possible. */
4520 if (regno == last_fp_reg
4521 && has_align_gap
4522 && !simd_function
4523 && (offset & 8) == 0)
4524 {
4525 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4526 break;
4527 }
4528
4529 cfun->machine->frame.reg_offset[regno] = offset;
4530 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4531 cfun->machine->frame.wb_candidate1 = regno;
4532 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4533 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4534 cfun->machine->frame.wb_candidate2 = regno;
4535 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4536 }
4537
4538 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4539
4540 cfun->machine->frame.saved_regs_size = offset;
4541
4542 HOST_WIDE_INT varargs_and_saved_regs_size
4543 = offset + cfun->machine->frame.saved_varargs_size;
4544
4545 cfun->machine->frame.hard_fp_offset
4546 = aligned_upper_bound (varargs_and_saved_regs_size
4547 + get_frame_size (),
4548 STACK_BOUNDARY / BITS_PER_UNIT);
4549
4550 /* Both these values are already aligned. */
4551 gcc_assert (multiple_p (crtl->outgoing_args_size,
4552 STACK_BOUNDARY / BITS_PER_UNIT));
4553 cfun->machine->frame.frame_size
4554 = (cfun->machine->frame.hard_fp_offset
4555 + crtl->outgoing_args_size);
4556
4557 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4558
4559 cfun->machine->frame.initial_adjust = 0;
4560 cfun->machine->frame.final_adjust = 0;
4561 cfun->machine->frame.callee_adjust = 0;
4562 cfun->machine->frame.callee_offset = 0;
4563
4564 HOST_WIDE_INT max_push_offset = 0;
4565 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4566 max_push_offset = 512;
4567 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4568 max_push_offset = 256;
4569
4570 HOST_WIDE_INT const_size, const_fp_offset;
4571 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4572 && const_size < max_push_offset
4573 && known_eq (crtl->outgoing_args_size, 0))
4574 {
4575 /* Simple, small frame with no outgoing arguments:
4576 stp reg1, reg2, [sp, -frame_size]!
4577 stp reg3, reg4, [sp, 16] */
4578 cfun->machine->frame.callee_adjust = const_size;
4579 }
4580 else if (known_lt (crtl->outgoing_args_size
4581 + cfun->machine->frame.saved_regs_size, 512)
4582 && !(cfun->calls_alloca
4583 && known_lt (cfun->machine->frame.hard_fp_offset,
4584 max_push_offset)))
4585 {
4586 /* Frame with small outgoing arguments:
4587 sub sp, sp, frame_size
4588 stp reg1, reg2, [sp, outgoing_args_size]
4589 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4590 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4591 cfun->machine->frame.callee_offset
4592 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4593 }
4594 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4595 && const_fp_offset < max_push_offset)
4596 {
4597 /* Frame with large outgoing arguments but a small local area:
4598 stp reg1, reg2, [sp, -hard_fp_offset]!
4599 stp reg3, reg4, [sp, 16]
4600 sub sp, sp, outgoing_args_size */
4601 cfun->machine->frame.callee_adjust = const_fp_offset;
4602 cfun->machine->frame.final_adjust
4603 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4604 }
4605 else
4606 {
4607 /* Frame with large local area and outgoing arguments using frame pointer:
4608 sub sp, sp, hard_fp_offset
4609 stp x29, x30, [sp, 0]
4610 add x29, sp, 0
4611 stp reg3, reg4, [sp, 16]
4612 sub sp, sp, outgoing_args_size */
4613 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4614 cfun->machine->frame.final_adjust
4615 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4616 }
4617
4618 cfun->machine->frame.laid_out = true;
4619 }
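
/* Worked example (illustrative, not from the original source): a function
   that needs a frame chain, saves only X29/X30, has 32 bytes of locals
   and no outgoing arguments ends up with saved_regs_size == 16,
   hard_fp_offset == 48 and frame_size == 48.  Since 48 < 512 and there
   are no outgoing arguments, the first case above applies and the whole
   frame is allocated by a single "stp x29, x30, [sp, -48]!".  */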
4620
4621 /* Return true if the register REGNO is saved on entry to
4622 the current function. */
4623
4624 static bool
4625 aarch64_register_saved_on_entry (int regno)
4626 {
4627 return cfun->machine->frame.reg_offset[regno] >= 0;
4628 }
4629
4630 /* Return the next register, from REGNO up to LIMIT, that the callee
4631 needs to save. */
4632
4633 static unsigned
4634 aarch64_next_callee_save (unsigned regno, unsigned limit)
4635 {
4636 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4637 regno ++;
4638 return regno;
4639 }
4640
4641 /* Push the register number REGNO of mode MODE to the stack with write-back
4642 adjusting the stack by ADJUSTMENT. */
4643
4644 static void
4645 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4646 HOST_WIDE_INT adjustment)
4647 {
4648 rtx base_rtx = stack_pointer_rtx;
4649 rtx insn, reg, mem;
4650
4651 reg = gen_rtx_REG (mode, regno);
4652 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4653 plus_constant (Pmode, base_rtx, -adjustment));
4654 mem = gen_frame_mem (mode, mem);
4655
4656 insn = emit_move_insn (mem, reg);
4657 RTX_FRAME_RELATED_P (insn) = 1;
4658 }
4659
4660 /* Generate and return an instruction to store the pair of registers
4661 REG and REG2 of mode MODE to location BASE with write-back adjusting
4662 the stack location BASE by ADJUSTMENT. */
4663
4664 static rtx
4665 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4666 HOST_WIDE_INT adjustment)
4667 {
4668 switch (mode)
4669 {
4670 case E_DImode:
4671 return gen_storewb_pairdi_di (base, base, reg, reg2,
4672 GEN_INT (-adjustment),
4673 GEN_INT (UNITS_PER_WORD - adjustment));
4674 case E_DFmode:
4675 return gen_storewb_pairdf_di (base, base, reg, reg2,
4676 GEN_INT (-adjustment),
4677 GEN_INT (UNITS_PER_WORD - adjustment));
4678 case E_TFmode:
4679 return gen_storewb_pairtf_di (base, base, reg, reg2,
4680 GEN_INT (-adjustment),
4681 GEN_INT (UNITS_PER_VREG - adjustment));
4682 default:
4683 gcc_unreachable ();
4684 }
4685 }
4686
4687 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4688 stack pointer by ADJUSTMENT. */
4689
4690 static void
4691 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4692 {
4693 rtx_insn *insn;
4694 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4695
4696 if (regno2 == INVALID_REGNUM)
4697 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4698
4699 rtx reg1 = gen_rtx_REG (mode, regno1);
4700 rtx reg2 = gen_rtx_REG (mode, regno2);
4701
4702 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4703 reg2, adjustment));
4704 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4705 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4706 RTX_FRAME_RELATED_P (insn) = 1;
4707 }
4708
4709 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4710 adjusting it by ADJUSTMENT afterwards. */
4711
4712 static rtx
4713 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4714 HOST_WIDE_INT adjustment)
4715 {
4716 switch (mode)
4717 {
4718 case E_DImode:
4719 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4720 GEN_INT (UNITS_PER_WORD));
4721 case E_DFmode:
4722 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4723 GEN_INT (UNITS_PER_WORD));
4724 case E_TFmode:
4725 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4726 GEN_INT (UNITS_PER_VREG));
4727 default:
4728 gcc_unreachable ();
4729 }
4730 }
4731
4732 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4733 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4734 into CFI_OPS. */
4735
4736 static void
4737 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4738 rtx *cfi_ops)
4739 {
4740 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4741 rtx reg1 = gen_rtx_REG (mode, regno1);
4742
4743 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4744
4745 if (regno2 == INVALID_REGNUM)
4746 {
4747 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4748 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4749 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4750 }
4751 else
4752 {
4753 rtx reg2 = gen_rtx_REG (mode, regno2);
4754 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4755 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4756 reg2, adjustment));
4757 }
4758 }
4759
4760 /* Generate and return a store pair instruction of mode MODE to store
4761 register REG1 to MEM1 and register REG2 to MEM2. */
4762
4763 static rtx
4764 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4765 rtx reg2)
4766 {
4767 switch (mode)
4768 {
4769 case E_DImode:
4770 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4771
4772 case E_DFmode:
4773 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4774
4775 case E_TFmode:
4776 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4777
4778 default:
4779 gcc_unreachable ();
4780 }
4781 }
4782
4783 /* Generate and return a load pair instruction of mode MODE to load register
4784 REG1 from MEM1 and register REG2 from MEM2. */
4785
4786 static rtx
4787 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4788 rtx mem2)
4789 {
4790 switch (mode)
4791 {
4792 case E_DImode:
4793 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4794
4795 case E_DFmode:
4796 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4797
4798 case E_TFmode:
4799 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4800
4801 default:
4802 gcc_unreachable ();
4803 }
4804 }
4805
4806 /* Return TRUE if return address signing should be enabled for the current
4807 function, otherwise return FALSE. */
4808
4809 bool
4810 aarch64_return_address_signing_enabled (void)
4811 {
4812 /* This function should only be called after the frame is laid out. */
4813 gcc_assert (cfun->machine->frame.laid_out);
4814
4815 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4816 function if its LR is pushed onto the stack. */
4817 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4818 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4819 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4820 }
4821
4822 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4823 bool
4824 aarch64_bti_enabled (void)
4825 {
4826 return (aarch64_enable_bti == 1);
4827 }
4828
4829 /* Emit code to save the callee-saved registers from register number START
4830 to LIMIT to the stack at the location starting at offset START_OFFSET,
4831 skipping any write-back candidates if SKIP_WB is true. */
4832
4833 static void
4834 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4835 unsigned start, unsigned limit, bool skip_wb)
4836 {
4837 rtx_insn *insn;
4838 unsigned regno;
4839 unsigned regno2;
4840
4841 for (regno = aarch64_next_callee_save (start, limit);
4842 regno <= limit;
4843 regno = aarch64_next_callee_save (regno + 1, limit))
4844 {
4845 rtx reg, mem;
4846 poly_int64 offset;
4847 int offset_diff;
4848
4849 if (skip_wb
4850 && (regno == cfun->machine->frame.wb_candidate1
4851 || regno == cfun->machine->frame.wb_candidate2))
4852 continue;
4853
4854 if (cfun->machine->reg_is_wrapped_separately[regno])
4855 continue;
4856
4857 reg = gen_rtx_REG (mode, regno);
4858 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4859 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4860 offset));
4861
4862 regno2 = aarch64_next_callee_save (regno + 1, limit);
4863 offset_diff = cfun->machine->frame.reg_offset[regno2]
4864 - cfun->machine->frame.reg_offset[regno];
4865
4866 if (regno2 <= limit
4867 && !cfun->machine->reg_is_wrapped_separately[regno2]
4868 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4869 {
4870 rtx reg2 = gen_rtx_REG (mode, regno2);
4871 rtx mem2;
4872
4873 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4874 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4875 offset));
4876 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4877 reg2));
4878
4879 /* The first part of a frame-related parallel insn is
4880 always assumed to be relevant to the frame
4881 calculations; subsequent parts are only
4882 frame-related if explicitly marked. */
4883 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4884 regno = regno2;
4885 }
4886 else
4887 insn = emit_move_insn (mem, reg);
4888
4889 RTX_FRAME_RELATED_P (insn) = 1;
4890 }
4891 }
4892
4893 /* Emit code to restore the callee registers of mode MODE from register
4894 number START up to and including LIMIT. Restore from the stack offset
4895 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4896 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4897
4898 static void
4899 aarch64_restore_callee_saves (machine_mode mode,
4900 poly_int64 start_offset, unsigned start,
4901 unsigned limit, bool skip_wb, rtx *cfi_ops)
4902 {
4903 rtx base_rtx = stack_pointer_rtx;
4904 unsigned regno;
4905 unsigned regno2;
4906 poly_int64 offset;
4907
4908 for (regno = aarch64_next_callee_save (start, limit);
4909 regno <= limit;
4910 regno = aarch64_next_callee_save (regno + 1, limit))
4911 {
4912 if (cfun->machine->reg_is_wrapped_separately[regno])
4913 continue;
4914
4915 rtx reg, mem;
4916 int offset_diff;
4917
4918 if (skip_wb
4919 && (regno == cfun->machine->frame.wb_candidate1
4920 || regno == cfun->machine->frame.wb_candidate2))
4921 continue;
4922
4923 reg = gen_rtx_REG (mode, regno);
4924 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4925 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4926
4927 regno2 = aarch64_next_callee_save (regno + 1, limit);
4928 offset_diff = cfun->machine->frame.reg_offset[regno2]
4929 - cfun->machine->frame.reg_offset[regno];
4930
4931 if (regno2 <= limit
4932 && !cfun->machine->reg_is_wrapped_separately[regno2]
4933 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4934 {
4935 rtx reg2 = gen_rtx_REG (mode, regno2);
4936 rtx mem2;
4937
4938 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4939 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4940 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4941
4942 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4943 regno = regno2;
4944 }
4945 else
4946 emit_move_insn (reg, mem);
4947 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4948 }
4949 }
4950
4951 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4952 of MODE. */
4953
4954 static inline bool
4955 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4956 {
4957 HOST_WIDE_INT multiple;
4958 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4959 && IN_RANGE (multiple, -8, 7));
4960 }
4961
4962 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4963 of MODE. */
4964
4965 static inline bool
4966 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4967 {
4968 HOST_WIDE_INT multiple;
4969 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4970 && IN_RANGE (multiple, 0, 63));
4971 }
4972
4973 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4974 of MODE. */
4975
4976 bool
4977 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4978 {
4979 HOST_WIDE_INT multiple;
4980 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4981 && IN_RANGE (multiple, -64, 63));
4982 }
4983
4984 /* Return true if OFFSET is a signed 9-bit value. */
4985
4986 bool
4987 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4988 poly_int64 offset)
4989 {
4990 HOST_WIDE_INT const_offset;
4991 return (offset.is_constant (&const_offset)
4992 && IN_RANGE (const_offset, -256, 255));
4993 }
4994
4995 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4996 of MODE. */
4997
4998 static inline bool
4999 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5000 {
5001 HOST_WIDE_INT multiple;
5002 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5003 && IN_RANGE (multiple, -256, 255));
5004 }
5005
5006 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5007 of MODE. */
5008
5009 static inline bool
5010 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5011 {
5012 HOST_WIDE_INT multiple;
5013 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5014 && IN_RANGE (multiple, 0, 4095));
5015 }
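
/* Worked examples for the predicates above (illustrative only), taking
   MODE == DImode so that the element size is 8 bytes:
   - offset_4bit_signed_scaled_p accepts multiples of 8 from -64 to +56;
   - offset_6bit_unsigned_scaled_p accepts multiples of 8 from 0 to 504;
   - aarch64_offset_7bit_signed_scaled_p accepts multiples of 8 from
     -512 to +504;
   - aarch64_offset_9bit_signed_unscaled_p accepts any byte offset in
     [-256, 255];
   - offset_12bit_unsigned_scaled_p accepts multiples of 8 from 0 to
     32760, matching the unsigned scaled addressing form of LDR/STR.  */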
5016
5017 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5018
5019 static sbitmap
5020 aarch64_get_separate_components (void)
5021 {
5022 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5023 bitmap_clear (components);
5024
5025 /* The registers we need saved to the frame. */
5026 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5027 if (aarch64_register_saved_on_entry (regno))
5028 {
5029 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5030 if (!frame_pointer_needed)
5031 offset += cfun->machine->frame.frame_size
5032 - cfun->machine->frame.hard_fp_offset;
5033 /* Check that we can access the stack slot of the register with one
5034 direct load with no adjustments needed. */
5035 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5036 bitmap_set_bit (components, regno);
5037 }
5038
5039 /* Don't mess with the hard frame pointer. */
5040 if (frame_pointer_needed)
5041 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5042
5043 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5044 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5045 /* If registers have been chosen to be stored/restored with
5046 writeback don't interfere with them to avoid having to output explicit
5047 stack adjustment instructions. */
5048 if (reg2 != INVALID_REGNUM)
5049 bitmap_clear_bit (components, reg2);
5050 if (reg1 != INVALID_REGNUM)
5051 bitmap_clear_bit (components, reg1);
5052
5053 bitmap_clear_bit (components, LR_REGNUM);
5054 bitmap_clear_bit (components, SP_REGNUM);
5055
5056 return components;
5057 }
5058
5059 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5060
5061 static sbitmap
5062 aarch64_components_for_bb (basic_block bb)
5063 {
5064 bitmap in = DF_LIVE_IN (bb);
5065 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5066 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5067 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5068
5069 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5070 bitmap_clear (components);
5071
5072 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5073 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5074 if ((!call_used_regs[regno]
5075 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5076 && (bitmap_bit_p (in, regno)
5077 || bitmap_bit_p (gen, regno)
5078 || bitmap_bit_p (kill, regno)))
5079 {
5080 unsigned regno2, offset, offset2;
5081 bitmap_set_bit (components, regno);
5082
5083 /* If there is a callee-save at an adjacent offset, add it as well,
5084 to increase the use of LDP/STP. */
5085 offset = cfun->machine->frame.reg_offset[regno];
5086 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5087
5088 if (regno2 <= LAST_SAVED_REGNUM)
5089 {
5090 offset2 = cfun->machine->frame.reg_offset[regno2];
5091 if ((offset & ~8) == (offset2 & ~8))
5092 bitmap_set_bit (components, regno2);
5093 }
5094 }
5095
5096 return components;
5097 }
5098
5099 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5100 Nothing to do for aarch64. */
5101
5102 static void
5103 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5104 {
5105 }
5106
5107 /* Return the next set bit in BMP from START onwards. Return the total number
5108 of bits in BMP if no set bit is found at or after START. */
5109
5110 static unsigned int
5111 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5112 {
5113 unsigned int nbits = SBITMAP_SIZE (bmp);
5114 if (start == nbits)
5115 return start;
5116
5117 gcc_assert (start < nbits);
5118 for (unsigned int i = start; i < nbits; i++)
5119 if (bitmap_bit_p (bmp, i))
5120 return i;
5121
5122 return nbits;
5123 }
5124
5125 /* Do the work for aarch64_emit_prologue_components and
5126 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5127 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5128 for these components or the epilogue sequence. That is, it determines
5129 whether we should emit stores or loads and what kind of CFA notes to attach
5130 to the insns. Otherwise the logic for the two sequences is very
5131 similar. */
5132
5133 static void
5134 aarch64_process_components (sbitmap components, bool prologue_p)
5135 {
5136 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5137 ? HARD_FRAME_POINTER_REGNUM
5138 : STACK_POINTER_REGNUM);
5139
5140 unsigned last_regno = SBITMAP_SIZE (components);
5141 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5142 rtx_insn *insn = NULL;
5143
5144 while (regno != last_regno)
5145 {
5146 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5147 so DFmode for the vector registers is enough. For simd functions
5148 we want to save the low 128 bits. */
5149 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5150
5151 rtx reg = gen_rtx_REG (mode, regno);
5152 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5153 if (!frame_pointer_needed)
5154 offset += cfun->machine->frame.frame_size
5155 - cfun->machine->frame.hard_fp_offset;
5156 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5157 rtx mem = gen_frame_mem (mode, addr);
5158
5159 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5160 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5161 /* No more registers to handle after REGNO.
5162 Emit a single save/restore and exit. */
5163 if (regno2 == last_regno)
5164 {
5165 insn = emit_insn (set);
5166 RTX_FRAME_RELATED_P (insn) = 1;
5167 if (prologue_p)
5168 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5169 else
5170 add_reg_note (insn, REG_CFA_RESTORE, reg);
5171 break;
5172 }
5173
5174 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5175 /* The next register is not of the same class or its offset is not
5176 mergeable with the current one into a pair. */
5177 if (!satisfies_constraint_Ump (mem)
5178 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5179 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5180 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5181 GET_MODE_SIZE (mode)))
5182 {
5183 insn = emit_insn (set);
5184 RTX_FRAME_RELATED_P (insn) = 1;
5185 if (prologue_p)
5186 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5187 else
5188 add_reg_note (insn, REG_CFA_RESTORE, reg);
5189
5190 regno = regno2;
5191 continue;
5192 }
5193
5194 /* REGNO2 can be saved/restored in a pair with REGNO. */
5195 rtx reg2 = gen_rtx_REG (mode, regno2);
5196 if (!frame_pointer_needed)
5197 offset2 += cfun->machine->frame.frame_size
5198 - cfun->machine->frame.hard_fp_offset;
5199 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5200 rtx mem2 = gen_frame_mem (mode, addr2);
5201 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5202 : gen_rtx_SET (reg2, mem2);
5203
5204 if (prologue_p)
5205 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5206 else
5207 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5208
5209 RTX_FRAME_RELATED_P (insn) = 1;
5210 if (prologue_p)
5211 {
5212 add_reg_note (insn, REG_CFA_OFFSET, set);
5213 add_reg_note (insn, REG_CFA_OFFSET, set2);
5214 }
5215 else
5216 {
5217 add_reg_note (insn, REG_CFA_RESTORE, reg);
5218 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5219 }
5220
5221 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5222 }
5223 }
5224
5225 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5226
5227 static void
5228 aarch64_emit_prologue_components (sbitmap components)
5229 {
5230 aarch64_process_components (components, true);
5231 }
5232
5233 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5234
5235 static void
5236 aarch64_emit_epilogue_components (sbitmap components)
5237 {
5238 aarch64_process_components (components, false);
5239 }
5240
5241 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5242
5243 static void
5244 aarch64_set_handled_components (sbitmap components)
5245 {
5246 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5247 if (bitmap_bit_p (components, regno))
5248 cfun->machine->reg_is_wrapped_separately[regno] = true;
5249 }
5250
5251 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5252 determine the probe offset for alloca. */
5253
5254 static HOST_WIDE_INT
5255 aarch64_stack_clash_protection_alloca_probe_range (void)
5256 {
5257 return STACK_CLASH_CALLER_GUARD;
5258 }
5259
5260
5261 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5262 registers. If POLY_SIZE is not large enough to require a probe this function
5263 will only adjust the stack. When allocating the stack space,
5264 FRAME_RELATED_P indicates whether the allocation is frame related.
5265 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5266 arguments. If we are, we ensure that any allocation larger than the
5267 ABI-defined buffer is probed, so that the invariant of having a 1KB
5268 buffer is maintained.
5269
5270 We emit barriers after each stack adjustment to prevent optimizations from
5271 breaking the invariant that we never drop the stack more than a page. This
5272 invariant is needed to make it easier to correctly handle asynchronous
5273 events: if we were to allow the stack to be dropped by more than a page
5274 and then issue multiple probes, and a signal arrived somewhere in between,
5275 then the signal handler wouldn't know the state of the stack and could make no
5276 assumptions about which pages have been probed. */
5277
5278 static void
5279 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5280 poly_int64 poly_size,
5281 bool frame_related_p,
5282 bool final_adjustment_p)
5283 {
5284 HOST_WIDE_INT guard_size
5285 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5286 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5287 /* When doing the final adjustment for the outgoing argument size we can't
5288 assume that LR was saved at position 0. So subtract its offset from the
5289 ABI safe buffer so that we don't accidentally allow an adjustment that
5290 would result in an allocation larger than the ABI buffer without
5291 probing. */
5292 HOST_WIDE_INT min_probe_threshold
5293 = final_adjustment_p
5294 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5295 : guard_size - guard_used_by_caller;
5296
5297 poly_int64 frame_size = cfun->machine->frame.frame_size;
5298
5299 /* We should always have a positive probe threshold. */
5300 gcc_assert (min_probe_threshold > 0);
5301
5302 if (flag_stack_clash_protection && !final_adjustment_p)
5303 {
5304 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5305 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5306
5307 if (known_eq (frame_size, 0))
5308 {
5309 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5310 }
5311 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5312 && known_lt (final_adjust, guard_used_by_caller))
5313 {
5314 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5315 }
5316 }
5317
5318 /* If SIZE is not large enough to require probing, just adjust the stack and
5319 exit. */
5320 if (known_lt (poly_size, min_probe_threshold)
5321 || !flag_stack_clash_protection)
5322 {
5323 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5324 return;
5325 }
5326
5327 HOST_WIDE_INT size;
5328 /* Handle the SVE non-constant case first. */
5329 if (!poly_size.is_constant (&size))
5330 {
5331 if (dump_file)
5332 {
5333 fprintf (dump_file, "Stack clash SVE prologue: ");
5334 print_dec (poly_size, dump_file);
5335 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5336 }
5337
5338 /* First calculate the number of bytes we're actually spilling. */
5339 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5340 poly_size, temp1, temp2, false, true);
5341
5342 rtx_insn *insn = get_last_insn ();
5343
5344 if (frame_related_p)
5345 {
5346 /* This is done to provide unwinding information for the stack
5347 adjustments we're about to do; however, to prevent the optimizers
5348 from removing the R11 move and leaving the CFA note (which would be
5349 very wrong), we tie the old and new stack pointers together.
5350 The tie expands to nothing, but the optimizers will not touch
5351 the instruction. */
5352 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5353 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5354 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5355
5356 /* We want the CFA independent of the stack pointer for the
5357 duration of the loop. */
5358 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5359 RTX_FRAME_RELATED_P (insn) = 1;
5360 }
5361
5362 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5363 rtx guard_const = gen_int_mode (guard_size, Pmode);
5364
5365 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5366 stack_pointer_rtx, temp1,
5367 probe_const, guard_const));
5368
5369 /* Now reset the CFA register if needed. */
5370 if (frame_related_p)
5371 {
5372 add_reg_note (insn, REG_CFA_DEF_CFA,
5373 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5374 gen_int_mode (poly_size, Pmode)));
5375 RTX_FRAME_RELATED_P (insn) = 1;
5376 }
5377
5378 return;
5379 }
5380
5381 if (dump_file)
5382 fprintf (dump_file,
5383 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5384 " bytes, probing will be required.\n", size);
5385
5386 /* Round size down to a multiple of guard_size, and calculate the
5387 residual as the difference between the original size and the rounded
5388 size. */
5389 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5390 HOST_WIDE_INT residual = size - rounded_size;
5391
5392 /* We can handle a small number of allocations/probes inline. Otherwise
5393 punt to a loop. */
5394 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5395 {
5396 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5397 {
5398 aarch64_sub_sp (NULL, temp2, guard_size, true);
5399 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5400 guard_used_by_caller));
5401 emit_insn (gen_blockage ());
5402 }
5403 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5404 }
5405 else
5406 {
5407 /* Compute the ending address. */
5408 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5409 temp1, NULL, false, true);
5410 rtx_insn *insn = get_last_insn ();
5411
5412 /* For the initial allocation, we don't have a frame pointer
5413 set up, so we always need CFI notes. If we're doing the
5414 final allocation, then we may have a frame pointer, in which
5415 case it is the CFA, otherwise we need CFI notes.
5416
5417 We can determine which allocation we are doing by looking at
5418 the value of FRAME_RELATED_P since the final allocations are not
5419 frame related. */
5420 if (frame_related_p)
5421 {
5422 /* We want the CFA independent of the stack pointer for the
5423 duration of the loop. */
5424 add_reg_note (insn, REG_CFA_DEF_CFA,
5425 plus_constant (Pmode, temp1, rounded_size));
5426 RTX_FRAME_RELATED_P (insn) = 1;
5427 }
5428
5429 /* This allocates and probes the stack. Note that this re-uses some of
5430 the existing Ada stack protection code. However, we are guaranteed not
5431 to enter the non-loop or residual branches of that code.
5432
5433 The non-loop part won't be entered because if our allocation amount
5434 doesn't require a loop, the case above would handle it.
5435
5436 The residual amount won't be entered because TEMP1 is a multiple of
5437 the allocation size. The residual will always be 0. As such, the only
5438 part we are actually using from that code is the loop setup. The
5439 actual probing is done in aarch64_output_probe_stack_range. */
5440 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5441 stack_pointer_rtx, temp1));
5442
5443 /* Now reset the CFA register if needed. */
5444 if (frame_related_p)
5445 {
5446 add_reg_note (insn, REG_CFA_DEF_CFA,
5447 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5448 RTX_FRAME_RELATED_P (insn) = 1;
5449 }
5450
5451 emit_insn (gen_blockage ());
5452 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5453 }
5454
5455 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5456 be probed. This maintains the requirement that each page is probed at
5457 least once. For initial probing we probe only if the allocation is
5458 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5459 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5460 GUARD_SIZE. This ensures that any allocation large enough to trigger a
5461 probe here gets at least one, and that any allocation too small for this
5462 code to emit anything will already have had its page probed by the
5463 saving of FP/LR, either by this function or by any callees. If
5464 we don't have any callees then we won't have more stack adjustments and so
5465 are still safe. */
5466 if (residual)
5467 {
5468 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5469 /* If we're doing final adjustments, and we've done any full page
5470 allocations then any residual needs to be probed. */
5471 if (final_adjustment_p && rounded_size != 0)
5472 min_probe_threshold = 0;
5473 /* If doing a small final adjustment, we always probe at offset 0.
5474 This is done to avoid issues when LR is not at position 0 or when
5475 the final adjustment is smaller than the probing offset. */
5476 else if (final_adjustment_p && rounded_size == 0)
5477 residual_probe_offset = 0;
5478
5479 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5480 if (residual >= min_probe_threshold)
5481 {
5482 if (dump_file)
5483 fprintf (dump_file,
5484 "Stack clash AArch64 prologue residuals: "
5485 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5486 "\n", residual);
5487
5488 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5489 residual_probe_offset));
5490 emit_insn (gen_blockage ());
5491 }
5492 }
5493 }
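
/* A worked example of the splitting logic above, written as a standalone
   sketch with illustrative names (it is not part of the GCC code). Assuming
   the default 64KB guard and the 1KB caller buffer, a hypothetical 200KB
   constant allocation is split into three full pages, each probed after its
   adjustment, plus an 8KB residual that needs no extra probe because it is
   below guard_size - guard_used_by_caller (64512 bytes). Whether the three
   pages are probed inline or via the loop depends on the unroll limit
   (STACK_CLASH_MAX_UNROLL_PAGES). */
#include <stdint.h>

struct probe_plan
{
  int64_t rounded_size; /* bytes covered by the per-page probes */
  int64_t residual;     /* leftover bytes after the last full page */
};

static struct probe_plan
plan_probes (int64_t size, int64_t guard_size)
{
  struct probe_plan p;
  p.rounded_size = size - (size % guard_size); /* ROUND_DOWN (size, guard_size) */
  p.residual = size - p.rounded_size;
  return p;
}

/* plan_probes (200 * 1024, 64 * 1024) gives rounded_size == 196608
   (three probed pages) and residual == 8192. */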
5494
5495 /* Return 1 if the register is used by the epilogue. We need to say the
5496 return register is used, but only after epilogue generation is complete.
5497 Note that in the case of sibcalls, the values "used by the epilogue" are
5498 considered live at the start of the called function.
5499
5500 For SIMD functions we need to return 1 for FP registers that are saved and
5501 restored by a function but are not zero in call_used_regs. If we do not do
5502 this, optimizations may remove the restore of the register. */
5503
5504 int
5505 aarch64_epilogue_uses (int regno)
5506 {
5507 if (epilogue_completed)
5508 {
5509 if (regno == LR_REGNUM)
5510 return 1;
5511 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5512 return 1;
5513 }
5514 return 0;
5515 }
5516
5517 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5518 is saved at BASE + OFFSET. */
5519
5520 static void
5521 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5522 rtx base, poly_int64 offset)
5523 {
5524 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5525 add_reg_note (insn, REG_CFA_EXPRESSION,
5526 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5527 }
5528
5529 /* AArch64 stack frames generated by this compiler look like:
5530
5531 +-------------------------------+
5532 | |
5533 | incoming stack arguments |
5534 | |
5535 +-------------------------------+
5536 | | <-- incoming stack pointer (aligned)
5537 | callee-allocated save area |
5538 | for register varargs |
5539 | |
5540 +-------------------------------+
5541 | local variables | <-- frame_pointer_rtx
5542 | |
5543 +-------------------------------+
5544 | padding | \
5545 +-------------------------------+ |
5546 | callee-saved registers | | frame.saved_regs_size
5547 +-------------------------------+ |
5548 | LR' | |
5549 +-------------------------------+ |
5550 | FP' | / <- hard_frame_pointer_rtx (aligned)
5551 +-------------------------------+
5552 | dynamic allocation |
5553 +-------------------------------+
5554 | padding |
5555 +-------------------------------+
5556 | outgoing stack arguments | <-- arg_pointer
5557 | |
5558 +-------------------------------+
5559 | | <-- stack_pointer_rtx (aligned)
5560
5561 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5562 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5563 unchanged.
5564
5565 By default for stack-clash we assume the guard is at least 64KB, but this
5566 value is configurable to either 4KB or 64KB. We also force the guard size to
5567 be the same as the probing interval and both values are kept in sync.
5568
5569 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5570 on the guard size) of stack space without probing.
5571
5572 When probing is needed, we emit a probe at the start of the prologue
5573 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5574
5575 We have to track how much space has been allocated; the only stores
5576 to the stack that we track as implicit probes are the FP/LR stores.
5577
5578 For outgoing arguments we probe if the size is larger than 1KB, such that
5579 the ABI specified buffer is maintained for the next callee.
5580
5581 The following registers are reserved during frame layout and should not be
5582 used for any other purpose:
5583
5584 - r11: Used by stack clash protection when SVE is enabled.
5585 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5586 - r14 and r15: Used for speculation tracking.
5587 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5588 - r30(LR), r29(FP): Used by standard frame layout.
5589
5590 These registers must be avoided in frame layout related code unless the
5591 explicit intention is to interact with one of the features listed above. */
5592
5593 /* Generate the prologue instructions for entry into a function.
5594 Establish the stack frame by decreasing the stack pointer with a
5595 properly calculated size and, if necessary, create a frame record
5596 filled with the values of LR and previous frame pointer. The
5597 current FP is also set up if it is in use. */
5598
5599 void
5600 aarch64_expand_prologue (void)
5601 {
5602 poly_int64 frame_size = cfun->machine->frame.frame_size;
5603 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5604 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5605 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5606 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5607 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5608 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5609 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5610 rtx_insn *insn;
5611
5612 /* Sign return address for functions. */
5613 if (aarch64_return_address_signing_enabled ())
5614 {
5615 insn = emit_insn (gen_pacisp ());
5616 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5617 RTX_FRAME_RELATED_P (insn) = 1;
5618 }
5619
5620 if (flag_stack_usage_info)
5621 current_function_static_stack_size = constant_lower_bound (frame_size);
5622
5623 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5624 {
5625 if (crtl->is_leaf && !cfun->calls_alloca)
5626 {
5627 if (maybe_gt (frame_size, PROBE_INTERVAL)
5628 && maybe_gt (frame_size, get_stack_check_protect ()))
5629 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5630 (frame_size
5631 - get_stack_check_protect ()));
5632 }
5633 else if (maybe_gt (frame_size, 0))
5634 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5635 }
5636
5637 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5638 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5639
5640 /* In theory we should never have both an initial adjustment
5641 and a callee save adjustment. Verify that is the case since the
5642 code below does not handle it for -fstack-clash-protection. */
5643 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5644
5645 /* Will only probe if the initial adjustment is larger than the guard
5646 less the amount of the guard reserved for use by the caller's
5647 outgoing args. */
5648 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5649 true, false);
5650
5651 if (callee_adjust != 0)
5652 aarch64_push_regs (reg1, reg2, callee_adjust);
5653
5654 if (emit_frame_chain)
5655 {
5656 poly_int64 reg_offset = callee_adjust;
5657 if (callee_adjust == 0)
5658 {
5659 reg1 = R29_REGNUM;
5660 reg2 = R30_REGNUM;
5661 reg_offset = callee_offset;
5662 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5663 }
5664 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5665 stack_pointer_rtx, callee_offset,
5666 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5667 if (frame_pointer_needed && !frame_size.is_constant ())
5668 {
5669 /* Variable-sized frames need to describe the save slot
5670 address using DW_CFA_expression rather than DW_CFA_offset.
5671 This means that, without taking further action, the
5672 locations of the registers that we've already saved would
5673 remain based on the stack pointer even after we redefine
5674 the CFA based on the frame pointer. We therefore need new
5675 DW_CFA_expressions to re-express the save slots with addresses
5676 based on the frame pointer. */
5677 rtx_insn *insn = get_last_insn ();
5678 gcc_assert (RTX_FRAME_RELATED_P (insn));
5679
5680 /* Add an explicit CFA definition if this was previously
5681 implicit. */
5682 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5683 {
5684 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5685 callee_offset);
5686 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5687 gen_rtx_SET (hard_frame_pointer_rtx, src));
5688 }
5689
5690 /* Change the save slot expressions for the registers that
5691 we've already saved. */
5692 reg_offset -= callee_offset;
5693 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5694 reg_offset + UNITS_PER_WORD);
5695 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5696 reg_offset);
5697 }
5698 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5699 }
5700
5701 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5702 callee_adjust != 0 || emit_frame_chain);
5703 if (aarch64_simd_decl_p (cfun->decl))
5704 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5705 callee_adjust != 0 || emit_frame_chain);
5706 else
5707 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5708 callee_adjust != 0 || emit_frame_chain);
5709
5710 /* We may need to probe the final adjustment if it is larger than the guard
5711 that is assumed by the callee. */
5712 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5713 !frame_pointer_needed, true);
5714 }
5715
5716 /* Return TRUE if we can use a simple_return insn.
5717
5718 This function checks whether the callee-saved stack is empty, which
5719 means no restore actions are needed. The pro_and_epilogue pass will use
5720 this to check whether the shrink-wrapping optimization is feasible. */
5721
5722 bool
5723 aarch64_use_return_insn_p (void)
5724 {
5725 if (!reload_completed)
5726 return false;
5727
5728 if (crtl->profile)
5729 return false;
5730
5731 return known_eq (cfun->machine->frame.frame_size, 0);
5732 }
5733
5734 /* Return false for non-leaf SIMD functions in order to avoid
5735 shrink-wrapping them, since shrink-wrapping would lose the necessary
5736 saves and restores of FP registers. */
5737
5738 bool
5739 aarch64_use_simple_return_insn_p (void)
5740 {
5741 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5742 return false;
5743
5744 return true;
5745 }
5746
5747 /* Generate the epilogue instructions for returning from a function.
5748 This is almost exactly the reverse of the prologue sequence, except
5749 that we need to insert barriers to avoid scheduling loads that read
5750 from a deallocated stack, and we optimize the unwind records by
5751 emitting them all together if possible. */
5752 void
5753 aarch64_expand_epilogue (bool for_sibcall)
5754 {
5755 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5756 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5757 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5758 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5759 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5760 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5761 rtx cfi_ops = NULL;
5762 rtx_insn *insn;
5763 /* A stack clash protection prologue may not have left EP0_REGNUM or
5764 EP1_REGNUM in a usable state. The same is true for allocations
5765 with an SVE component, since we then need both temporary registers
5766 for each allocation. For stack clash we are in a usable state if
5767 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5768 HOST_WIDE_INT guard_size
5769 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5770 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5771
5772 /* We can re-use the registers when the allocation amount is smaller than
5773 guard_size - guard_used_by_caller because we won't be doing any probes
5774 then. In such situations the register should remain live with the correct
5775 value. */
5776 bool can_inherit_p = (initial_adjust.is_constant ()
5777 && final_adjust.is_constant ())
5778 && (!flag_stack_clash_protection
5779 || known_lt (initial_adjust,
5780 guard_size - guard_used_by_caller));
5781
5782 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
5783 bool need_barrier_p
5784 = maybe_ne (get_frame_size ()
5785 + cfun->machine->frame.saved_varargs_size, 0);
5786
5787 /* Emit a barrier to prevent loads from a deallocated stack. */
5788 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5789 || cfun->calls_alloca
5790 || crtl->calls_eh_return)
5791 {
5792 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5793 need_barrier_p = false;
5794 }
5795
5796 /* Restore the stack pointer from the frame pointer if it may not
5797 be the same as the stack pointer. */
5798 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5799 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5800 if (frame_pointer_needed
5801 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5802 /* If writeback is used when restoring callee-saves, the CFA
5803 is restored on the instruction doing the writeback. */
5804 aarch64_add_offset (Pmode, stack_pointer_rtx,
5805 hard_frame_pointer_rtx, -callee_offset,
5806 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5807 else
5808 /* The case where we need to re-use the register here is very rare, so
5809 avoid the complicated condition and just always emit a move if the
5810 immediate doesn't fit. */
5811 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5812
5813 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5814 callee_adjust != 0, &cfi_ops);
5815 if (aarch64_simd_decl_p (cfun->decl))
5816 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5817 callee_adjust != 0, &cfi_ops);
5818 else
5819 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5820 callee_adjust != 0, &cfi_ops);
5821
5822 if (need_barrier_p)
5823 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5824
5825 if (callee_adjust != 0)
5826 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5827
5828 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5829 {
5830 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5831 insn = get_last_insn ();
5832 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5833 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5834 RTX_FRAME_RELATED_P (insn) = 1;
5835 cfi_ops = NULL;
5836 }
5837
5838 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5839 restrict the emit_move optimization to leaf functions. */
5840 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5841 (!can_inherit_p || !crtl->is_leaf
5842 || df_regs_ever_live_p (EP0_REGNUM)));
5843
5844 if (cfi_ops)
5845 {
5846 /* Emit delayed restores and reset the CFA to be SP. */
5847 insn = get_last_insn ();
5848 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5849 REG_NOTES (insn) = cfi_ops;
5850 RTX_FRAME_RELATED_P (insn) = 1;
5851 }
5852
5853 /* We prefer to emit the combined return/authenticate instruction RETAA,
5854 however there are three cases in which we must instead emit an explicit
5855 authentication instruction.
5856
5857 1) Sibcalls don't return in a normal way, so if we're about to call one
5858 we must authenticate.
5859
5860 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5861 generating code for !TARGET_ARMV8_3 we can't use it and must
5862 explicitly authenticate.
5863
5864 3) On an eh_return path we make extra stack adjustments to update the
5865 canonical frame address to be the exception handler's CFA. We want
5866 to authenticate using the CFA of the function which calls eh_return.
5867 */
5868 if (aarch64_return_address_signing_enabled ()
5869 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5870 {
5871 insn = emit_insn (gen_autisp ());
5872 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5873 RTX_FRAME_RELATED_P (insn) = 1;
5874 }
5875
5876 /* Stack adjustment for exception handler. */
5877 if (crtl->calls_eh_return)
5878 {
5879 /* We need to unwind the stack by the offset computed by
5880 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5881 to be SP; letting the CFA move during this adjustment
5882 is just as correct as retaining the CFA from the body
5883 of the function. Therefore, do nothing special. */
5884 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5885 }
5886
5887 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5888 if (!for_sibcall)
5889 emit_jump_insn (ret_rtx);
5890 }
5891
5892 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5893 normally or return to a previous frame after unwinding.
5894
5895 An EH return uses a single shared return sequence. The epilogue is
5896 exactly like a normal epilogue except that it has an extra input
5897 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5898 that must be applied after the frame has been destroyed. An extra label
5899 is inserted before the epilogue which initializes this register to zero,
5900 and this is the entry point for a normal return.
5901
5902 An actual EH return updates the return address, initializes the stack
5903 adjustment and jumps directly into the epilogue (bypassing the zeroing
5904 of the adjustment). Since the return address is typically saved on the
5905 stack when a function makes a call, the saved LR must be updated outside
5906 the epilogue.
5907
5908 This poses problems as the store is generated well before the epilogue,
5909 so the offset of LR is not known yet. Also optimizations will remove the
5910 store as it appears dead, even after the epilogue is generated (as the
5911 base or offset for loading LR is different in many cases).
5912
5913 To avoid these problems this implementation forces the frame pointer
5914 in eh_return functions so that the location of LR is fixed and known early.
5915 It also marks the store volatile, so no optimization is permitted to
5916 remove the store. */
5917 rtx
5918 aarch64_eh_return_handler_rtx (void)
5919 {
5920 rtx tmp = gen_frame_mem (Pmode,
5921 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5922
5923 /* Mark the store volatile, so no optimization is permitted to remove it. */
5924 MEM_VOLATILE_P (tmp) = true;
5925 return tmp;
5926 }
5927
5928 /* Output code to add DELTA to the first argument, and then jump
5929 to FUNCTION. Used for C++ multiple inheritance. */
5930 static void
5931 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5932 HOST_WIDE_INT delta,
5933 HOST_WIDE_INT vcall_offset,
5934 tree function)
5935 {
5936 /* The this pointer is always in x0. Note that this differs from
5937 Arm, where the this pointer may be bumped to r1 if r0 is required
5938 to return a pointer to an aggregate. On AArch64 a result value
5939 pointer will be in x8. */
5940 int this_regno = R0_REGNUM;
5941 rtx this_rtx, temp0, temp1, addr, funexp;
5942 rtx_insn *insn;
5943
5944 reload_completed = 1;
5945 emit_note (NOTE_INSN_PROLOGUE_END);
5946
5947 this_rtx = gen_rtx_REG (Pmode, this_regno);
5948 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5949 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5950
5951 if (vcall_offset == 0)
5952 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5953 else
5954 {
5955 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5956
5957 addr = this_rtx;
5958 if (delta != 0)
5959 {
5960 if (delta >= -256 && delta < 256)
5961 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5962 plus_constant (Pmode, this_rtx, delta));
5963 else
5964 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5965 temp1, temp0, false);
5966 }
5967
5968 if (Pmode == ptr_mode)
5969 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5970 else
5971 aarch64_emit_move (temp0,
5972 gen_rtx_ZERO_EXTEND (Pmode,
5973 gen_rtx_MEM (ptr_mode, addr)));
5974
5975 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5976 addr = plus_constant (Pmode, temp0, vcall_offset);
5977 else
5978 {
5979 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5980 Pmode);
5981 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5982 }
5983
5984 if (Pmode == ptr_mode)
5985 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5986 else
5987 aarch64_emit_move (temp1,
5988 gen_rtx_SIGN_EXTEND (Pmode,
5989 gen_rtx_MEM (ptr_mode, addr)));
5990
5991 emit_insn (gen_add2_insn (this_rtx, temp1));
5992 }
5993
5994 /* Generate a tail call to the target function. */
5995 if (!TREE_USED (function))
5996 {
5997 assemble_external (function);
5998 TREE_USED (function) = 1;
5999 }
6000 funexp = XEXP (DECL_RTL (function), 0);
6001 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6002 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6003 SIBLING_CALL_P (insn) = 1;
6004
6005 insn = get_insns ();
6006 shorten_branches (insn);
6007 final_start_function (insn, file, 1);
6008 final (insn, file, 1);
6009 final_end_function ();
6010
6011 /* Stop pretending to be a post-reload pass. */
6012 reload_completed = 0;
6013 }
6014
6015 static bool
6016 aarch64_tls_referenced_p (rtx x)
6017 {
6018 if (!TARGET_HAVE_TLS)
6019 return false;
6020 subrtx_iterator::array_type array;
6021 FOR_EACH_SUBRTX (iter, array, x, ALL)
6022 {
6023 const_rtx x = *iter;
6024 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6025 return true;
6026 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6027 TLS offsets, not real symbol references. */
6028 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6029 iter.skip_subrtxes ();
6030 }
6031 return false;
6032 }
6033
6034
6035 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6036 a left shift of 0 or 12 bits. */
6037 bool
6038 aarch64_uimm12_shift (HOST_WIDE_INT val)
6039 {
6040 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6041 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6042 );
6043 }
6044
6045 /* Return the largest value no greater than VAL that can be encoded as a
6046 12-bit unsigned immediate with a left shift of 0 or 12 bits. */
6047 static HOST_WIDE_INT
6048 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6049 {
6050 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6051 handle correctly. */
6052 gcc_assert ((val & 0xffffff) == val);
6053
6054 if (((val & 0xfff) << 0) == val)
6055 return val;
6056
6057 return val & (0xfff << 12);
6058 }
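
/* A minimal standalone sketch of the two helpers above, using plain uint64_t
   instead of HOST_WIDE_INT; the names are made up for illustration. An
   add/sub immediate is a 12-bit value optionally shifted left by 12, so
   0xfff and 0xfff000 pass the test while 0x1001 does not, and clamping
   0x123456 yields 0x123000. */
#include <stdint.h>
#include <stdbool.h>

static bool
uimm12_shift_p (uint64_t val)
{
  /* All set bits fit either in bits [11:0] or in bits [23:12]. */
  return (val & 0xfffULL) == val || (val & (0xfffULL << 12)) == val;
}

static uint64_t
clamp_to_uimm12_shift (uint64_t val)
{
  /* VAL is assumed to fit in 24 bits; keep it if the low 12 bits are the
     whole value, otherwise round down to a 12-bit value shifted by 12. */
  return (val & 0xfffULL) == val ? val : (val & (0xfffULL << 12));
}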
6059
6060 /* Return true if val is an immediate that can be loaded into a
6061 register by a MOVZ instruction. */
6062 static bool
6063 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6064 {
6065 if (GET_MODE_SIZE (mode) > 4)
6066 {
6067 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6068 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6069 return 1;
6070 }
6071 else
6072 {
6073 /* Ignore sign extension. */
6074 val &= (HOST_WIDE_INT) 0xffffffff;
6075 }
6076 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6077 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6078 }
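
/* Illustrative sketch of the 64-bit MOVZ test above with plain types; the
   helper name is an assumption made for illustration. A MOVZ immediate is a
   single 16-bit chunk placed at bit 0, 16, 32 or 48, so 0x12340000 and
   0xabcd000000000000 qualify while 0x12345678 does not. */
#include <stdint.h>
#include <stdbool.h>

static bool
movz_imm64_p (uint64_t val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffULL << shift)) == val)
      return true; /* all set bits lie in one 16-bit chunk */
  return false;
}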
6079
6080 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6081 64-bit (DImode) integer. */
6082
6083 static unsigned HOST_WIDE_INT
6084 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6085 {
6086 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6087 while (size < 64)
6088 {
6089 val &= (HOST_WIDE_INT_1U << size) - 1;
6090 val |= val << size;
6091 size *= 2;
6092 }
6093 return val;
6094 }
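
/* A worked example of the replication above, assuming an 8-bit inner mode:
   starting from 0xab, each iteration doubles the pattern width, giving
   0xab -> 0xabab -> 0xabababab -> 0xabababababababab, after which the value
   can be checked with the 64-bit bitmask logic below. A standalone sketch
   with an illustrative name: */
#include <stdint.h>

static uint64_t
replicate_to_64 (uint64_t val, unsigned int width)
{
  /* WIDTH is the precision of the inner mode, e.g. 8 for QImode. */
  while (width < 64)
    {
      val &= (UINT64_C (1) << width) - 1; /* keep only the current pattern */
      val |= val << width;                /* duplicate it */
      width *= 2;
    }
  return val;
}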
6095
6096 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6097
6098 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6099 {
6100 0x0000000100000001ull,
6101 0x0001000100010001ull,
6102 0x0101010101010101ull,
6103 0x1111111111111111ull,
6104 0x5555555555555555ull,
6105 };
6106
6107
6108 /* Return true if val is a valid bitmask immediate. */
6109
6110 bool
6111 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6112 {
6113 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6114 int bits;
6115
6116 /* Check for a single sequence of one bits and return quickly if so.
6117 The special cases of all ones and all zeroes return false.
6118 val = aarch64_replicate_bitmask_imm (val_in, mode);
6119 tmp = val + (val & -val);
6120
6121 if (tmp == (tmp & -tmp))
6122 return (val + 1) > 1;
6123
6124 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6125 if (mode == SImode)
6126 val = (val << 32) | (val & 0xffffffff);
6127
6128 /* Invert if the immediate doesn't start with a zero bit - this means we
6129 only need to search for sequences of one bits. */
6130 if (val & 1)
6131 val = ~val;
6132
6133 /* Find the first set bit and set tmp to val with the first sequence of one
6134 bits removed. Return success if there is a single sequence of ones. */
6135 first_one = val & -val;
6136 tmp = val & (val + first_one);
6137
6138 if (tmp == 0)
6139 return true;
6140
6141 /* Find the next set bit and compute the difference in bit position. */
6142 next_one = tmp & -tmp;
6143 bits = clz_hwi (first_one) - clz_hwi (next_one);
6144 mask = val ^ tmp;
6145
6146 /* Check the bit position difference is a power of 2, and that the first
6147 sequence of one bits fits within 'bits' bits. */
6148 if ((mask >> bits) != 0 || bits != (bits & -bits))
6149 return false;
6150
6151 /* Check the sequence of one bits is repeated 64/bits times. */
6152 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6153 }
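
/* Two illustrations of the test above. First, the fast path: adding the
   lowest set bit to VAL clears its lowest run of ones, so a single
   contiguous run leaves either zero or a power of two behind. A minimal
   standalone sketch of just that fast path, with an illustrative name: */
#include <stdint.h>
#include <stdbool.h>

static bool
single_run_of_ones_p (uint64_t val)
{
  /* 0x0000ffff00000000 -> true; 0x00ff00ff00ff00ff -> false (it has more
     than one run and is handled by the repetition check above). */
  uint64_t tmp = val + (val & -val);
  return val != 0 && ~val != 0 && (tmp & (tmp - 1)) == 0;
}

/* Second, a trace of the repetition check for 0x00ff00ff00ff00ff: the value
   is inverted to 0xff00ff00ff00ff00, the first run of ones starts at bit 8
   and the next at bit 24, so BITS is 16 and MASK is 0xff00; multiplying
   0xff00 by 0x0001000100010001 reproduces the inverted value, so the
   immediate is accepted as a repeating 16-bit element with 8 ones. */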
6154
6155 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
6156 Assumed precondition: VAL_IN is not zero. */
6157
6158 unsigned HOST_WIDE_INT
6159 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6160 {
6161 int lowest_bit_set = ctz_hwi (val_in);
6162 int highest_bit_set = floor_log2 (val_in);
6163 gcc_assert (val_in != 0);
6164
6165 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6166 (HOST_WIDE_INT_1U << lowest_bit_set));
6167 }
6168
6169 /* Create a constant in which the bits outside the lowest-to-highest set-bit
6170 range of VAL_IN are set to 1. */
6171
6172 unsigned HOST_WIDE_INT
6173 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6174 {
6175 return val_in | ~aarch64_and_split_imm1 (val_in);
6176 }
6177
6178 /* Return true if an AND with immediate VAL_IN can be split into two AND instructions with bitmask immediates. */
6179
6180 bool
6181 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6182 {
6183 scalar_int_mode int_mode;
6184 if (!is_a <scalar_int_mode> (mode, &int_mode))
6185 return false;
6186
6187 if (aarch64_bitmask_imm (val_in, int_mode))
6188 return false;
6189
6190 if (aarch64_move_imm (val_in, int_mode))
6191 return false;
6192
6193 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6194
6195 return aarch64_bitmask_imm (imm2, int_mode);
6196 }
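
/* A worked example of the split above, with values chosen for illustration
   (x0 stands for any destination register): VAL_IN = 0x0000ff00ff000000 is
   neither a bitmask immediate (its two runs of ones do not form a repeating
   pattern) nor a MOV immediate. Its set bits span bits 24 to 47, so
   aarch64_and_split_imm1 produces 0x0000ffffff000000 (a single run, a valid
   bitmask immediate) and aarch64_and_split_imm2 produces 0xffffff00ffffffff
   (all ones except bits 32-39, also a valid bitmask immediate). The two
   masks intersect in exactly VAL_IN, so the AND can be expanded as two AND
   instructions:

     and x0, x0, #0x0000ffffff000000
     and x0, x0, #0xffffff00ffffffff  */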
6197
6198 /* Return true if val is an immediate that can be loaded into a
6199 register in a single instruction. */
6200 bool
6201 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6202 {
6203 scalar_int_mode int_mode;
6204 if (!is_a <scalar_int_mode> (mode, &int_mode))
6205 return false;
6206
6207 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6208 return 1;
6209 return aarch64_bitmask_imm (val, int_mode);
6210 }
6211
6212 static bool
6213 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6214 {
6215 rtx base, offset;
6216
6217 if (GET_CODE (x) == HIGH)
6218 return true;
6219
6220 /* There's no way to calculate VL-based values using relocations. */
6221 subrtx_iterator::array_type array;
6222 FOR_EACH_SUBRTX (iter, array, x, ALL)
6223 if (GET_CODE (*iter) == CONST_POLY_INT)
6224 return true;
6225
6226 split_const (x, &base, &offset);
6227 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6228 {
6229 if (aarch64_classify_symbol (base, INTVAL (offset))
6230 != SYMBOL_FORCE_TO_MEM)
6231 return true;
6232 else
6233 /* Avoid generating a 64-bit relocation in ILP32; leave
6234 to aarch64_expand_mov_immediate to handle it properly. */
6235 return mode != ptr_mode;
6236 }
6237
6238 return aarch64_tls_referenced_p (x);
6239 }
6240
6241 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6242 The expansion for a table switch is quite expensive due to the number
6243 of instructions, the table lookup and the hard-to-predict indirect jump.
6244 When optimizing for speed with -O3 enabled, use the per-core tuning if
6245 set, otherwise use tables for > 16 cases as a tradeoff between size and
6246 performance. When optimizing for size, use the default setting. */
6247
6248 static unsigned int
6249 aarch64_case_values_threshold (void)
6250 {
6251 /* Use the specified limit for the number of cases before using jump
6252 tables at higher optimization levels. */
6253 if (optimize > 2
6254 && selected_cpu->tune->max_case_values != 0)
6255 return selected_cpu->tune->max_case_values;
6256 else
6257 return optimize_size ? default_case_values_threshold () : 17;
6258 }
6259
6260 /* Return true if register REGNO is a valid index register.
6261 STRICT_P is true if REG_OK_STRICT is in effect. */
6262
6263 bool
6264 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6265 {
6266 if (!HARD_REGISTER_NUM_P (regno))
6267 {
6268 if (!strict_p)
6269 return true;
6270
6271 if (!reg_renumber)
6272 return false;
6273
6274 regno = reg_renumber[regno];
6275 }
6276 return GP_REGNUM_P (regno);
6277 }
6278
6279 /* Return true if register REGNO is a valid base register for mode MODE.
6280 STRICT_P is true if REG_OK_STRICT is in effect. */
6281
6282 bool
6283 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6284 {
6285 if (!HARD_REGISTER_NUM_P (regno))
6286 {
6287 if (!strict_p)
6288 return true;
6289
6290 if (!reg_renumber)
6291 return false;
6292
6293 regno = reg_renumber[regno];
6294 }
6295
6296 /* The fake registers will be eliminated to either the stack or
6297 hard frame pointer, both of which are usually valid base registers.
6298 Reload deals with the cases where the eliminated form isn't valid. */
6299 return (GP_REGNUM_P (regno)
6300 || regno == SP_REGNUM
6301 || regno == FRAME_POINTER_REGNUM
6302 || regno == ARG_POINTER_REGNUM);
6303 }
6304
6305 /* Return true if X is a valid base register for mode MODE.
6306 STRICT_P is true if REG_OK_STRICT is in effect. */
6307
6308 static bool
6309 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6310 {
6311 if (!strict_p
6312 && GET_CODE (x) == SUBREG
6313 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6314 x = SUBREG_REG (x);
6315
6316 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6317 }
6318
6319 /* Return true if the address offset X is a valid index. If it is, fill in INFO
6320 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6321
6322 static bool
6323 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6324 machine_mode mode, bool strict_p)
6325 {
6326 enum aarch64_address_type type;
6327 rtx index;
6328 int shift;
6329
6330 /* (reg:P) */
6331 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6332 && GET_MODE (x) == Pmode)
6333 {
6334 type = ADDRESS_REG_REG;
6335 index = x;
6336 shift = 0;
6337 }
6338 /* (sign_extend:DI (reg:SI)) */
6339 else if ((GET_CODE (x) == SIGN_EXTEND
6340 || GET_CODE (x) == ZERO_EXTEND)
6341 && GET_MODE (x) == DImode
6342 && GET_MODE (XEXP (x, 0)) == SImode)
6343 {
6344 type = (GET_CODE (x) == SIGN_EXTEND)
6345 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6346 index = XEXP (x, 0);
6347 shift = 0;
6348 }
6349 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6350 else if (GET_CODE (x) == MULT
6351 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6352 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6353 && GET_MODE (XEXP (x, 0)) == DImode
6354 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6355 && CONST_INT_P (XEXP (x, 1)))
6356 {
6357 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6358 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6359 index = XEXP (XEXP (x, 0), 0);
6360 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6361 }
6362 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6363 else if (GET_CODE (x) == ASHIFT
6364 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6365 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6366 && GET_MODE (XEXP (x, 0)) == DImode
6367 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6368 && CONST_INT_P (XEXP (x, 1)))
6369 {
6370 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6371 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6372 index = XEXP (XEXP (x, 0), 0);
6373 shift = INTVAL (XEXP (x, 1));
6374 }
6375 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6376 else if ((GET_CODE (x) == SIGN_EXTRACT
6377 || GET_CODE (x) == ZERO_EXTRACT)
6378 && GET_MODE (x) == DImode
6379 && GET_CODE (XEXP (x, 0)) == MULT
6380 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6381 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6382 {
6383 type = (GET_CODE (x) == SIGN_EXTRACT)
6384 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6385 index = XEXP (XEXP (x, 0), 0);
6386 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6387 if (INTVAL (XEXP (x, 1)) != 32 + shift
6388 || INTVAL (XEXP (x, 2)) != 0)
6389 shift = -1;
6390 }
6391 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6392 (const_int 0xffffffff<<shift)) */
6393 else if (GET_CODE (x) == AND
6394 && GET_MODE (x) == DImode
6395 && GET_CODE (XEXP (x, 0)) == MULT
6396 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6397 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6398 && CONST_INT_P (XEXP (x, 1)))
6399 {
6400 type = ADDRESS_REG_UXTW;
6401 index = XEXP (XEXP (x, 0), 0);
6402 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6403 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6404 shift = -1;
6405 }
6406 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6407 else if ((GET_CODE (x) == SIGN_EXTRACT
6408 || GET_CODE (x) == ZERO_EXTRACT)
6409 && GET_MODE (x) == DImode
6410 && GET_CODE (XEXP (x, 0)) == ASHIFT
6411 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6412 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6413 {
6414 type = (GET_CODE (x) == SIGN_EXTRACT)
6415 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6416 index = XEXP (XEXP (x, 0), 0);
6417 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6418 if (INTVAL (XEXP (x, 1)) != 32 + shift
6419 || INTVAL (XEXP (x, 2)) != 0)
6420 shift = -1;
6421 }
6422 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6423 (const_int 0xffffffff<<shift)) */
6424 else if (GET_CODE (x) == AND
6425 && GET_MODE (x) == DImode
6426 && GET_CODE (XEXP (x, 0)) == ASHIFT
6427 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6428 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6429 && CONST_INT_P (XEXP (x, 1)))
6430 {
6431 type = ADDRESS_REG_UXTW;
6432 index = XEXP (XEXP (x, 0), 0);
6433 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6434 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6435 shift = -1;
6436 }
6437 /* (mult:P (reg:P) (const_int scale)) */
6438 else if (GET_CODE (x) == MULT
6439 && GET_MODE (x) == Pmode
6440 && GET_MODE (XEXP (x, 0)) == Pmode
6441 && CONST_INT_P (XEXP (x, 1)))
6442 {
6443 type = ADDRESS_REG_REG;
6444 index = XEXP (x, 0);
6445 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6446 }
6447 /* (ashift:P (reg:P) (const_int shift)) */
6448 else if (GET_CODE (x) == ASHIFT
6449 && GET_MODE (x) == Pmode
6450 && GET_MODE (XEXP (x, 0)) == Pmode
6451 && CONST_INT_P (XEXP (x, 1)))
6452 {
6453 type = ADDRESS_REG_REG;
6454 index = XEXP (x, 0);
6455 shift = INTVAL (XEXP (x, 1));
6456 }
6457 else
6458 return false;
6459
6460 if (!strict_p
6461 && GET_CODE (index) == SUBREG
6462 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6463 index = SUBREG_REG (index);
6464
6465 if (aarch64_sve_data_mode_p (mode))
6466 {
6467 if (type != ADDRESS_REG_REG
6468 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6469 return false;
6470 }
6471 else
6472 {
6473 if (shift != 0
6474 && !(IN_RANGE (shift, 1, 3)
6475 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6476 return false;
6477 }
6478
6479 if (REG_P (index)
6480 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6481 {
6482 info->type = type;
6483 info->offset = index;
6484 info->shift = shift;
6485 return true;
6486 }
6487
6488 return false;
6489 }
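
/* An illustrative case of the classification above (register numbers are
   arbitrary): for a DImode access, an index of the form
     (ashift:DI (reg:DI x1) (const_int 3))
   matches the ASHIFT arm and yields ADDRESS_REG_REG with a shift of 3,
   i.e. the [Xn, Xm, LSL #3] addressing mode, accepted because 1 << 3
   equals GET_MODE_SIZE (DImode). The same index with a shift of 2 would
   be rejected, since the scale must match the access size. */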
6490
6491 /* Return true if MODE is one of the modes for which we
6492 support LDP/STP operations. */
6493
6494 static bool
6495 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6496 {
6497 return mode == SImode || mode == DImode
6498 || mode == SFmode || mode == DFmode
6499 || (aarch64_vector_mode_supported_p (mode)
6500 && (known_eq (GET_MODE_SIZE (mode), 8)
6501 || (known_eq (GET_MODE_SIZE (mode), 16)
6502 && (aarch64_tune_params.extra_tuning_flags
6503 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6504 }
6505
6506 /* Return true if REGNO is a virtual pointer register, or an eliminable
6507 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6508 include stack_pointer or hard_frame_pointer. */
6509 static bool
6510 virt_or_elim_regno_p (unsigned regno)
6511 {
6512 return ((regno >= FIRST_VIRTUAL_REGISTER
6513 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6514 || regno == FRAME_POINTER_REGNUM
6515 || regno == ARG_POINTER_REGNUM);
6516 }
6517
6518 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6519 If it is, fill in INFO appropriately. STRICT_P is true if
6520 REG_OK_STRICT is in effect. */
6521
6522 bool
6523 aarch64_classify_address (struct aarch64_address_info *info,
6524 rtx x, machine_mode mode, bool strict_p,
6525 aarch64_addr_query_type type)
6526 {
6527 enum rtx_code code = GET_CODE (x);
6528 rtx op0, op1;
6529 poly_int64 offset;
6530
6531 HOST_WIDE_INT const_size;
6532
6533 /* On BE, we use load/store pair for all large int mode load/stores.
6534 TI/TFmode may also use a load/store pair. */
6535 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6536 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6537 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6538 || type == ADDR_QUERY_LDP_STP_N
6539 || mode == TImode
6540 || mode == TFmode
6541 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6542
6543 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6544 corresponds to the actual size of the memory being loaded/stored, while the
6545 mode used for the corresponding addressing is half that size. */
6546 if (type == ADDR_QUERY_LDP_STP_N
6547 && known_eq (GET_MODE_SIZE (mode), 16))
6548 mode = DFmode;
6549
6550 bool allow_reg_index_p = (!load_store_pair_p
6551 && (known_lt (GET_MODE_SIZE (mode), 16)
6552 || vec_flags == VEC_ADVSIMD
6553 || vec_flags == VEC_SVE_DATA));
6554
6555 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6556 [Rn, #offset, MUL VL]. */
6557 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6558 && (code != REG && code != PLUS))
6559 return false;
6560
6561 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6562 REG addressing. */
6563 if (advsimd_struct_p
6564 && !BYTES_BIG_ENDIAN
6565 && (code != POST_INC && code != REG))
6566 return false;
6567
6568 gcc_checking_assert (GET_MODE (x) == VOIDmode
6569 || SCALAR_INT_MODE_P (GET_MODE (x)));
6570
6571 switch (code)
6572 {
6573 case REG:
6574 case SUBREG:
6575 info->type = ADDRESS_REG_IMM;
6576 info->base = x;
6577 info->offset = const0_rtx;
6578 info->const_offset = 0;
6579 return aarch64_base_register_rtx_p (x, strict_p);
6580
6581 case PLUS:
6582 op0 = XEXP (x, 0);
6583 op1 = XEXP (x, 1);
6584
6585 if (! strict_p
6586 && REG_P (op0)
6587 && virt_or_elim_regno_p (REGNO (op0))
6588 && poly_int_rtx_p (op1, &offset))
6589 {
6590 info->type = ADDRESS_REG_IMM;
6591 info->base = op0;
6592 info->offset = op1;
6593 info->const_offset = offset;
6594
6595 return true;
6596 }
6597
6598 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6599 && aarch64_base_register_rtx_p (op0, strict_p)
6600 && poly_int_rtx_p (op1, &offset))
6601 {
6602 info->type = ADDRESS_REG_IMM;
6603 info->base = op0;
6604 info->offset = op1;
6605 info->const_offset = offset;
6606
6607 /* TImode and TFmode values are allowed in both pairs of X
6608 registers and individual Q registers. The available
6609 address modes are:
6610 X,X: 7-bit signed scaled offset
6611 Q: 9-bit signed offset
6612 We conservatively require an offset representable in either mode.
6613 When performing the check for pairs of X registers i.e. LDP/STP
6614 pass down DImode since that is the natural size of the LDP/STP
6615 instruction memory accesses. */
6616 if (mode == TImode || mode == TFmode)
6617 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6618 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6619 || offset_12bit_unsigned_scaled_p (mode, offset)));
6620
6621 /* A 7-bit offset check because OImode will emit an ldp/stp
6622 instruction (only big-endian will get here).
6623 For ldp/stp instructions, the offset is scaled for the size of a
6624 single element of the pair. */
6625 if (mode == OImode)
6626 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6627
6628 /* Three 9/12-bit offset checks because CImode will emit three
6629 ldr/str instructions (only big-endian will get here). */
6630 if (mode == CImode)
6631 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6632 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6633 offset + 32)
6634 || offset_12bit_unsigned_scaled_p (V16QImode,
6635 offset + 32)));
6636
6637 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6638 instructions (only big-endian will get here). */
6639 if (mode == XImode)
6640 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6641 && aarch64_offset_7bit_signed_scaled_p (TImode,
6642 offset + 32));
6643
6644 /* Make "m" use the LD1 offset range for SVE data modes, so
6645 that pre-RTL optimizers like ivopts will work to that
6646 instead of the wider LDR/STR range. */
6647 if (vec_flags == VEC_SVE_DATA)
6648 return (type == ADDR_QUERY_M
6649 ? offset_4bit_signed_scaled_p (mode, offset)
6650 : offset_9bit_signed_scaled_p (mode, offset));
6651
6652 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6653 {
6654 poly_int64 end_offset = (offset
6655 + GET_MODE_SIZE (mode)
6656 - BYTES_PER_SVE_VECTOR);
6657 return (type == ADDR_QUERY_M
6658 ? offset_4bit_signed_scaled_p (mode, offset)
6659 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6660 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6661 end_offset)));
6662 }
6663
6664 if (vec_flags == VEC_SVE_PRED)
6665 return offset_9bit_signed_scaled_p (mode, offset);
6666
6667 if (load_store_pair_p)
6668 return ((known_eq (GET_MODE_SIZE (mode), 4)
6669 || known_eq (GET_MODE_SIZE (mode), 8)
6670 || known_eq (GET_MODE_SIZE (mode), 16))
6671 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6672 else
6673 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6674 || offset_12bit_unsigned_scaled_p (mode, offset));
6675 }
6676
6677 if (allow_reg_index_p)
6678 {
6679 /* Look for base + (scaled/extended) index register. */
6680 if (aarch64_base_register_rtx_p (op0, strict_p)
6681 && aarch64_classify_index (info, op1, mode, strict_p))
6682 {
6683 info->base = op0;
6684 return true;
6685 }
6686 if (aarch64_base_register_rtx_p (op1, strict_p)
6687 && aarch64_classify_index (info, op0, mode, strict_p))
6688 {
6689 info->base = op1;
6690 return true;
6691 }
6692 }
6693
6694 return false;
6695
6696 case POST_INC:
6697 case POST_DEC:
6698 case PRE_INC:
6699 case PRE_DEC:
6700 info->type = ADDRESS_REG_WB;
6701 info->base = XEXP (x, 0);
6702 info->offset = NULL_RTX;
6703 return aarch64_base_register_rtx_p (info->base, strict_p);
6704
6705 case POST_MODIFY:
6706 case PRE_MODIFY:
6707 info->type = ADDRESS_REG_WB;
6708 info->base = XEXP (x, 0);
6709 if (GET_CODE (XEXP (x, 1)) == PLUS
6710 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6711 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6712 && aarch64_base_register_rtx_p (info->base, strict_p))
6713 {
6714 info->offset = XEXP (XEXP (x, 1), 1);
6715 info->const_offset = offset;
6716
6717 /* TImode and TFmode values are allowed in both pairs of X
6718 registers and individual Q registers. The available
6719 address modes are:
6720 X,X: 7-bit signed scaled offset
6721 Q: 9-bit signed offset
6722 We conservatively require an offset representable in either mode.
6723 */
6724 if (mode == TImode || mode == TFmode)
6725 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6726 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6727
6728 if (load_store_pair_p)
6729 return ((known_eq (GET_MODE_SIZE (mode), 4)
6730 || known_eq (GET_MODE_SIZE (mode), 8)
6731 || known_eq (GET_MODE_SIZE (mode), 16))
6732 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6733 else
6734 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6735 }
6736 return false;
6737
6738 case CONST:
6739 case SYMBOL_REF:
6740 case LABEL_REF:
6741 /* load literal: pc-relative constant pool entry. Only supported
6742 for SI mode or larger. */
6743 info->type = ADDRESS_SYMBOLIC;
6744
6745 if (!load_store_pair_p
6746 && GET_MODE_SIZE (mode).is_constant (&const_size)
6747 && const_size >= 4)
6748 {
6749 rtx sym, addend;
6750
6751 split_const (x, &sym, &addend);
6752 return ((GET_CODE (sym) == LABEL_REF
6753 || (GET_CODE (sym) == SYMBOL_REF
6754 && CONSTANT_POOL_ADDRESS_P (sym)
6755 && aarch64_pcrelative_literal_loads)));
6756 }
6757 return false;
6758
6759 case LO_SUM:
6760 info->type = ADDRESS_LO_SUM;
6761 info->base = XEXP (x, 0);
6762 info->offset = XEXP (x, 1);
6763 if (allow_reg_index_p
6764 && aarch64_base_register_rtx_p (info->base, strict_p))
6765 {
6766 rtx sym, offs;
6767 split_const (info->offset, &sym, &offs);
6768 if (GET_CODE (sym) == SYMBOL_REF
6769 && (aarch64_classify_symbol (sym, INTVAL (offs))
6770 == SYMBOL_SMALL_ABSOLUTE))
6771 {
6772 /* The symbol and offset must be aligned to the access size. */
6773 unsigned int align;
6774
6775 if (CONSTANT_POOL_ADDRESS_P (sym))
6776 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6777 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6778 {
6779 tree exp = SYMBOL_REF_DECL (sym);
6780 align = TYPE_ALIGN (TREE_TYPE (exp));
6781 align = aarch64_constant_alignment (exp, align);
6782 }
6783 else if (SYMBOL_REF_DECL (sym))
6784 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6785 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6786 && SYMBOL_REF_BLOCK (sym) != NULL)
6787 align = SYMBOL_REF_BLOCK (sym)->alignment;
6788 else
6789 align = BITS_PER_UNIT;
6790
6791 poly_int64 ref_size = GET_MODE_SIZE (mode);
6792 if (known_eq (ref_size, 0))
6793 ref_size = GET_MODE_SIZE (DImode);
6794
6795 return (multiple_p (INTVAL (offs), ref_size)
6796 && multiple_p (align / BITS_PER_UNIT, ref_size));
6797 }
6798 }
6799 return false;
6800
6801 default:
6802 return false;
6803 }
6804 }
6805
6806 /* Return true if the address X is valid for a PRFM instruction.
6807 STRICT_P is true if we should do strict checking with
6808 aarch64_classify_address. */
6809
6810 bool
6811 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6812 {
6813 struct aarch64_address_info addr;
6814
6815 /* PRFM accepts the same addresses as DImode... */
6816 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6817 if (!res)
6818 return false;
6819
6820 /* ... except writeback forms. */
6821 return addr.type != ADDRESS_REG_WB;
6822 }
6823
6824 bool
6825 aarch64_symbolic_address_p (rtx x)
6826 {
6827 rtx offset;
6828
6829 split_const (x, &x, &offset);
6830 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6831 }
6832
6833 /* Classify the base of symbolic expression X. */
6834
6835 enum aarch64_symbol_type
6836 aarch64_classify_symbolic_expression (rtx x)
6837 {
6838 rtx offset;
6839
6840 split_const (x, &x, &offset);
6841 return aarch64_classify_symbol (x, INTVAL (offset));
6842 }
6843
6844
6845 /* Return TRUE if X is a legitimate address for accessing memory in
6846 mode MODE. */
6847 static bool
6848 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6849 {
6850 struct aarch64_address_info addr;
6851
6852 return aarch64_classify_address (&addr, x, mode, strict_p);
6853 }
6854
6855 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6856 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6857 bool
6858 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6859 aarch64_addr_query_type type)
6860 {
6861 struct aarch64_address_info addr;
6862
6863 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6864 }
6865
6866 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6867
6868 static bool
6869 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6870 poly_int64 orig_offset,
6871 machine_mode mode)
6872 {
6873 HOST_WIDE_INT size;
6874 if (GET_MODE_SIZE (mode).is_constant (&size))
6875 {
6876 HOST_WIDE_INT const_offset, second_offset;
6877
6878 /* A general SVE offset is A * VQ + B. Remove the A component from
6879 coefficient 0 in order to get the constant B. */
6880 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6881
6882	      /* Split an out-of-range address displacement into a base and
6883		 offset.  Use a 4KB range for 1- and 2-byte accesses and a 16KB
6884		 range otherwise, to increase opportunities for sharing the base
6885		 address between accesses of different sizes.  Unaligned accesses
6886		 use the signed 9-bit range; TImode/TFmode use the intersection of
6887		 the signed scaled 7-bit and signed 9-bit offsets.  */
6888 if (mode == TImode || mode == TFmode)
6889 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6890 else if ((const_offset & (size - 1)) != 0)
6891 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6892 else
6893 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6894
6895 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6896 return false;
6897
6898 /* Split the offset into second_offset and the rest. */
6899 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6900 *offset2 = gen_int_mode (second_offset, Pmode);
6901 return true;
6902 }
6903 else
6904 {
6905 /* Get the mode we should use as the basis of the range. For structure
6906 modes this is the mode of one vector. */
6907 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6908 machine_mode step_mode
6909 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6910
6911 /* Get the "mul vl" multiplier we'd like to use. */
6912 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6913 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6914 if (vec_flags & VEC_SVE_DATA)
6915 /* LDR supports a 9-bit range, but the move patterns for
6916 structure modes require all vectors to be in range of the
6917 same base. The simplest way of accomodating that while still
6918 promoting reuse of anchor points between different modes is
6919 to use an 8-bit range unconditionally. */
6920 vnum = ((vnum + 128) & 255) - 128;
6921 else
6922 /* Predicates are only handled singly, so we might as well use
6923 the full range. */
6924 vnum = ((vnum + 256) & 511) - 256;
6925 if (vnum == 0)
6926 return false;
6927
6928 /* Convert the "mul vl" multiplier into a byte offset. */
6929 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6930 if (known_eq (second_offset, orig_offset))
6931 return false;
6932
6933 /* Split the offset into second_offset and the rest. */
6934 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6935 *offset2 = gen_int_mode (second_offset, Pmode);
6936 return true;
6937 }
6938 }
6939
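/* Editor's note: a minimal standalone sketch (not part of GCC) of the
   constant-size offset splitting performed above, assuming a 16-byte
   TImode-style access, which uses the intersection of the scaled 7-bit
   and signed 9-bit ranges.  The residual kept on the access must fit the
   addressing mode; the anchor is the remainder.  */
#include <stdio.h>

int
main (void)
{
  long long const_offset = 0x12345;

  /* Mirror of ((const_offset + 0x100) & 0x1f8) - 0x100: a signed,
     8-byte-aligned value in [-256, 248].  */
  long long second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
  long long anchor = const_offset - second_offset;

  printf ("offset %#llx = anchor %#llx + residual %lld\n",
	  (unsigned long long) const_offset,
	  (unsigned long long) anchor, second_offset);
  return 0;
}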
6940 /* Return the binary representation of floating point constant VALUE in INTVAL.
6941 If the value cannot be converted, return false without setting INTVAL.
6942    The conversion is done in the mode of VALUE.  */
6943 bool
6944 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6945 {
6946
6947 /* We make a general exception for 0. */
6948 if (aarch64_float_const_zero_rtx_p (value))
6949 {
6950 *intval = 0;
6951 return true;
6952 }
6953
6954 scalar_float_mode mode;
6955 if (GET_CODE (value) != CONST_DOUBLE
6956 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6957 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6958 /* Only support up to DF mode. */
6959 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6960 return false;
6961
6962 unsigned HOST_WIDE_INT ival = 0;
6963
6964 long res[2];
6965 real_to_target (res,
6966 CONST_DOUBLE_REAL_VALUE (value),
6967 REAL_MODE_FORMAT (mode));
6968
6969 if (mode == DFmode)
6970 {
6971 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6972 ival = zext_hwi (res[order], 32);
6973 ival |= (zext_hwi (res[1 - order], 32) << 32);
6974 }
6975 else
6976 ival = zext_hwi (res[0], 32);
6977
6978 *intval = ival;
6979 return true;
6980 }
6981
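/* Editor's note: a standalone illustration (not GCC code) of what
   reinterpreting a DFmode constant as an integer means: the IEEE-754
   bit pattern of the double is copied verbatim into a 64-bit value,
   e.g. 1.0 becomes 0x3ff0000000000000.  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double d = 1.0;
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);	/* Same bits, different type.  */
  printf ("%f -> %#018llx\n", d, (unsigned long long) bits);
  return 0;
}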
6982 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6983 single MOV(+MOVK) followed by an FMOV. */
6984 bool
6985 aarch64_float_const_rtx_p (rtx x)
6986 {
6987 machine_mode mode = GET_MODE (x);
6988 if (mode == VOIDmode)
6989 return false;
6990
6991	  /* Determine whether it's cheaper to write float constants as
6992	     mov/movk pairs than as ldr/adrp pairs.  */
6993 unsigned HOST_WIDE_INT ival;
6994
6995 if (GET_CODE (x) == CONST_DOUBLE
6996 && SCALAR_FLOAT_MODE_P (mode)
6997 && aarch64_reinterpret_float_as_int (x, &ival))
6998 {
6999 scalar_int_mode imode = (mode == HFmode
7000 ? SImode
7001 : int_mode_for_mode (mode).require ());
7002 int num_instr = aarch64_internal_mov_immediate
7003 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7004 return num_instr < 3;
7005 }
7006
7007 return false;
7008 }
7009
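/* Editor's note: a rough standalone approximation (not GCC's
   aarch64_internal_mov_immediate, which also knows about MOVN and
   bitmask immediates) of the "fewer than three integer instructions"
   test above: one MOVZ plus one MOVK per extra non-zero 16-bit chunk.  */
#include <stdio.h>

static int
approx_mov_movk_count (unsigned long long val)
{
  int chunks = 0;
  for (int shift = 0; shift < 64; shift += 16)
    if ((val >> shift) & 0xffff)
      chunks++;
  return chunks ? chunks : 1;	/* MOVZ with #0 is still one insn.  */
}

int
main (void)
{
  /* The bits of 1.0 (0x3ff0000000000000) have a single non-zero 16-bit
     chunk, so one MOVZ plus an FMOV can beat an ADRP/LDR literal load.  */
  printf ("%d\n", approx_mov_movk_count (0x3ff0000000000000ULL));
  return 0;
}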
7010 /* Return TRUE if rtx X is the immediate constant 0.0.  */
7011 bool
7012 aarch64_float_const_zero_rtx_p (rtx x)
7013 {
7014 if (GET_MODE (x) == VOIDmode)
7015 return false;
7016
7017 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7018 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7019 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7020 }
7021
7022 /* Return TRUE if rtx X is an immediate constant that fits in a single
7023 MOVI immediate operation. */
7024 bool
7025 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7026 {
7027 if (!TARGET_SIMD)
7028 return false;
7029
7030 machine_mode vmode;
7031 scalar_int_mode imode;
7032 unsigned HOST_WIDE_INT ival;
7033
7034 if (GET_CODE (x) == CONST_DOUBLE
7035 && SCALAR_FLOAT_MODE_P (mode))
7036 {
7037 if (!aarch64_reinterpret_float_as_int (x, &ival))
7038 return false;
7039
7040 /* We make a general exception for 0. */
7041 if (aarch64_float_const_zero_rtx_p (x))
7042 return true;
7043
7044 imode = int_mode_for_mode (mode).require ();
7045 }
7046 else if (GET_CODE (x) == CONST_INT
7047 && is_a <scalar_int_mode> (mode, &imode))
7048 ival = INTVAL (x);
7049 else
7050 return false;
7051
7052	  /* Use a 64-bit mode for everything except DI/DF mode, where we use
7053	     a 128-bit vector mode.  */
7054 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7055
7056 vmode = aarch64_simd_container_mode (imode, width);
7057 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7058
7059 return aarch64_simd_valid_immediate (v_op, NULL);
7060 }
7061
7062
7063 /* Return the fixed registers used for condition codes. */
7064
7065 static bool
7066 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7067 {
7068 *p1 = CC_REGNUM;
7069 *p2 = INVALID_REGNUM;
7070 return true;
7071 }
7072
7073 /* This function is used by the call expanders of the machine description.
7074 RESULT is the register in which the result is returned. It's NULL for
7075 "call" and "sibcall".
7076 MEM is the location of the function call.
7077    SIBCALL indicates whether this function call is a normal call or a sibling
7078    call; a different pattern is generated accordingly.  */
7079
7080 void
7081 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7082 {
7083 rtx call, callee, tmp;
7084 rtvec vec;
7085 machine_mode mode;
7086
7087 gcc_assert (MEM_P (mem));
7088 callee = XEXP (mem, 0);
7089 mode = GET_MODE (callee);
7090 gcc_assert (mode == Pmode);
7091
7092 /* Decide if we should generate indirect calls by loading the
7093 address of the callee into a register before performing
7094 the branch-and-link. */
7095 if (SYMBOL_REF_P (callee)
7096 ? (aarch64_is_long_call_p (callee)
7097 || aarch64_is_noplt_call_p (callee))
7098 : !REG_P (callee))
7099 XEXP (mem, 0) = force_reg (mode, callee);
7100
7101 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7102
7103 if (result != NULL_RTX)
7104 call = gen_rtx_SET (result, call);
7105
7106 if (sibcall)
7107 tmp = ret_rtx;
7108 else
7109 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7110
7111 vec = gen_rtvec (2, call, tmp);
7112 call = gen_rtx_PARALLEL (VOIDmode, vec);
7113
7114 aarch64_emit_call_insn (call);
7115 }
7116
7117 /* Emit call insn with PAT and do aarch64-specific handling. */
7118
7119 void
7120 aarch64_emit_call_insn (rtx pat)
7121 {
7122 rtx insn = emit_call_insn (pat);
7123
7124 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7125 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7126 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7127 }
7128
7129 machine_mode
7130 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7131 {
7132 machine_mode mode_x = GET_MODE (x);
7133 rtx_code code_x = GET_CODE (x);
7134
7135	  /* Floating-point compares return CCFP if they must not raise an
7136	     exception on a NaN (EQ/NE, ORDERED/UNORDERED, UN*), and CCFPE otherwise.  */
7137 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7138 {
7139 switch (code)
7140 {
7141 case EQ:
7142 case NE:
7143 case UNORDERED:
7144 case ORDERED:
7145 case UNLT:
7146 case UNLE:
7147 case UNGT:
7148 case UNGE:
7149 case UNEQ:
7150 return CCFPmode;
7151
7152 case LT:
7153 case LE:
7154 case GT:
7155 case GE:
7156 case LTGT:
7157 return CCFPEmode;
7158
7159 default:
7160 gcc_unreachable ();
7161 }
7162 }
7163
7164 /* Equality comparisons of short modes against zero can be performed
7165 using the TST instruction with the appropriate bitmask. */
7166 if (y == const0_rtx && REG_P (x)
7167 && (code == EQ || code == NE)
7168 && (mode_x == HImode || mode_x == QImode))
7169 return CC_NZmode;
7170
7171 /* Similarly, comparisons of zero_extends from shorter modes can
7172 be performed using an ANDS with an immediate mask. */
7173 if (y == const0_rtx && code_x == ZERO_EXTEND
7174 && (mode_x == SImode || mode_x == DImode)
7175 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7176 && (code == EQ || code == NE))
7177 return CC_NZmode;
7178
7179 if ((mode_x == SImode || mode_x == DImode)
7180 && y == const0_rtx
7181 && (code == EQ || code == NE || code == LT || code == GE)
7182 && (code_x == PLUS || code_x == MINUS || code_x == AND
7183 || code_x == NEG
7184 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7185 && CONST_INT_P (XEXP (x, 2)))))
7186 return CC_NZmode;
7187
7188 /* A compare with a shifted operand. Because of canonicalization,
7189 the comparison will have to be swapped when we emit the assembly
7190 code. */
7191 if ((mode_x == SImode || mode_x == DImode)
7192 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7193 && (code_x == ASHIFT || code_x == ASHIFTRT
7194 || code_x == LSHIFTRT
7195 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7196 return CC_SWPmode;
7197
7198 /* Similarly for a negated operand, but we can only do this for
7199 equalities. */
7200 if ((mode_x == SImode || mode_x == DImode)
7201 && (REG_P (y) || GET_CODE (y) == SUBREG)
7202 && (code == EQ || code == NE)
7203 && code_x == NEG)
7204 return CC_Zmode;
7205
7206 /* A test for unsigned overflow from an addition. */
7207 if ((mode_x == DImode || mode_x == TImode)
7208 && (code == LTU || code == GEU)
7209 && code_x == PLUS
7210 && rtx_equal_p (XEXP (x, 0), y))
7211 return CC_Cmode;
7212
7213 /* A test for unsigned overflow from an add with carry. */
7214 if ((mode_x == DImode || mode_x == TImode)
7215 && (code == LTU || code == GEU)
7216 && code_x == PLUS
7217 && CONST_SCALAR_INT_P (y)
7218 && (rtx_mode_t (y, mode_x)
7219 == (wi::shwi (1, mode_x)
7220 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7221 return CC_ADCmode;
7222
7223 /* A test for signed overflow. */
7224 if ((mode_x == DImode || mode_x == TImode)
7225 && code == NE
7226 && code_x == PLUS
7227 && GET_CODE (y) == SIGN_EXTEND)
7228 return CC_Vmode;
7229
7230 /* For everything else, return CCmode. */
7231 return CCmode;
7232 }
7233
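/* Editor's note: an example C function (not part of GCC) of the shape
   that the CC_Cmode case above targets: an unsigned-overflow check of
   the form "sum < operand".  Whether the comparison actually reuses the
   carry flag set by the addition depends on the optimizer, but this is
   the pattern the mode exists for.  */
unsigned long long
saturating_add (unsigned long long x, unsigned long long y)
{
  unsigned long long sum = x + y;
  return sum < x ? ~0ULL : sum;	/* sum < x iff the addition wrapped.  */
}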
7234 static int
7235 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7236
7237 int
7238 aarch64_get_condition_code (rtx x)
7239 {
7240 machine_mode mode = GET_MODE (XEXP (x, 0));
7241 enum rtx_code comp_code = GET_CODE (x);
7242
7243 if (GET_MODE_CLASS (mode) != MODE_CC)
7244 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7245 return aarch64_get_condition_code_1 (mode, comp_code);
7246 }
7247
7248 static int
7249 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7250 {
7251 switch (mode)
7252 {
7253 case E_CCFPmode:
7254 case E_CCFPEmode:
7255 switch (comp_code)
7256 {
7257 case GE: return AARCH64_GE;
7258 case GT: return AARCH64_GT;
7259 case LE: return AARCH64_LS;
7260 case LT: return AARCH64_MI;
7261 case NE: return AARCH64_NE;
7262 case EQ: return AARCH64_EQ;
7263 case ORDERED: return AARCH64_VC;
7264 case UNORDERED: return AARCH64_VS;
7265 case UNLT: return AARCH64_LT;
7266 case UNLE: return AARCH64_LE;
7267 case UNGT: return AARCH64_HI;
7268 case UNGE: return AARCH64_PL;
7269 default: return -1;
7270 }
7271 break;
7272
7273 case E_CCmode:
7274 switch (comp_code)
7275 {
7276 case NE: return AARCH64_NE;
7277 case EQ: return AARCH64_EQ;
7278 case GE: return AARCH64_GE;
7279 case GT: return AARCH64_GT;
7280 case LE: return AARCH64_LE;
7281 case LT: return AARCH64_LT;
7282 case GEU: return AARCH64_CS;
7283 case GTU: return AARCH64_HI;
7284 case LEU: return AARCH64_LS;
7285 case LTU: return AARCH64_CC;
7286 default: return -1;
7287 }
7288 break;
7289
7290 case E_CC_SWPmode:
7291 switch (comp_code)
7292 {
7293 case NE: return AARCH64_NE;
7294 case EQ: return AARCH64_EQ;
7295 case GE: return AARCH64_LE;
7296 case GT: return AARCH64_LT;
7297 case LE: return AARCH64_GE;
7298 case LT: return AARCH64_GT;
7299 case GEU: return AARCH64_LS;
7300 case GTU: return AARCH64_CC;
7301 case LEU: return AARCH64_CS;
7302 case LTU: return AARCH64_HI;
7303 default: return -1;
7304 }
7305 break;
7306
7307 case E_CC_NZmode:
7308 switch (comp_code)
7309 {
7310 case NE: return AARCH64_NE;
7311 case EQ: return AARCH64_EQ;
7312 case GE: return AARCH64_PL;
7313 case LT: return AARCH64_MI;
7314 default: return -1;
7315 }
7316 break;
7317
7318 case E_CC_Zmode:
7319 switch (comp_code)
7320 {
7321 case NE: return AARCH64_NE;
7322 case EQ: return AARCH64_EQ;
7323 default: return -1;
7324 }
7325 break;
7326
7327 case E_CC_Cmode:
7328 switch (comp_code)
7329 {
7330 case LTU: return AARCH64_CS;
7331 case GEU: return AARCH64_CC;
7332 default: return -1;
7333 }
7334 break;
7335
7336 case E_CC_ADCmode:
7337 switch (comp_code)
7338 {
7339 case GEU: return AARCH64_CS;
7340 case LTU: return AARCH64_CC;
7341 default: return -1;
7342 }
7343 break;
7344
7345 case E_CC_Vmode:
7346 switch (comp_code)
7347 {
7348 case NE: return AARCH64_VS;
7349 case EQ: return AARCH64_VC;
7350 default: return -1;
7351 }
7352 break;
7353
7354 default:
7355 return -1;
7356 }
7357
7358 return -1;
7359 }
7360
7361 bool
7362 aarch64_const_vec_all_same_in_range_p (rtx x,
7363 HOST_WIDE_INT minval,
7364 HOST_WIDE_INT maxval)
7365 {
7366 rtx elt;
7367 return (const_vec_duplicate_p (x, &elt)
7368 && CONST_INT_P (elt)
7369 && IN_RANGE (INTVAL (elt), minval, maxval));
7370 }
7371
7372 bool
7373 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7374 {
7375 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7376 }
7377
7378 /* Return true if VEC is a constant in which every element is in the range
7379 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7380
7381 static bool
7382 aarch64_const_vec_all_in_range_p (rtx vec,
7383 HOST_WIDE_INT minval,
7384 HOST_WIDE_INT maxval)
7385 {
7386 if (GET_CODE (vec) != CONST_VECTOR
7387 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7388 return false;
7389
7390 int nunits;
7391 if (!CONST_VECTOR_STEPPED_P (vec))
7392 nunits = const_vector_encoded_nelts (vec);
7393 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7394 return false;
7395
7396 for (int i = 0; i < nunits; i++)
7397 {
7398 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7399 if (!CONST_INT_P (vec_elem)
7400 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7401 return false;
7402 }
7403 return true;
7404 }
7405
7406 /* N Z C V. */
7407 #define AARCH64_CC_V 1
7408 #define AARCH64_CC_C (1 << 1)
7409 #define AARCH64_CC_Z (1 << 2)
7410 #define AARCH64_CC_N (1 << 3)
7411
7412 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7413 static const int aarch64_nzcv_codes[] =
7414 {
7415 0, /* EQ, Z == 1. */
7416 AARCH64_CC_Z, /* NE, Z == 0. */
7417 0, /* CS, C == 1. */
7418 AARCH64_CC_C, /* CC, C == 0. */
7419 0, /* MI, N == 1. */
7420 AARCH64_CC_N, /* PL, N == 0. */
7421 0, /* VS, V == 1. */
7422 AARCH64_CC_V, /* VC, V == 0. */
7423   0,		/* HI, C == 1 && Z == 0.  */
7424 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7425 AARCH64_CC_V, /* GE, N == V. */
7426 0, /* LT, N != V. */
7427 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7428 0, /* LE, !(Z == 0 && N == V). */
7429 0, /* AL, Any. */
7430 0 /* NV, Any. */
7431 };
7432
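/* Editor's note: a standalone sketch (not GCC code) of the NZCV
   encoding used above, with N=8, Z=4, C=2, V=1.  As the table comments
   note, GE holds when N == V; its entry (AARCH64_CC_V) sets V but not
   N, i.e. a flag state in which GE does not hold.  */
#include <stdio.h>

#define CC_V 1
#define CC_C (1 << 1)
#define CC_Z (1 << 2)
#define CC_N (1 << 3)

int
main (void)
{
  int nzcv = CC_V;			/* The table entry for GE.  */
  int n = (nzcv & CC_N) != 0;
  int v = (nzcv & CC_V) != 0;
  printf ("GE (N == V) under nzcv=%d: %s\n", nzcv, n == v ? "true" : "false");
  return 0;
}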
7433 /* Print floating-point vector immediate operand X to F, negating it
7434 first if NEGATE is true. Return true on success, false if it isn't
7435 a constant we can handle. */
7436
7437 static bool
7438 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7439 {
7440 rtx elt;
7441
7442 if (!const_vec_duplicate_p (x, &elt))
7443 return false;
7444
7445 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7446 if (negate)
7447 r = real_value_negate (&r);
7448
7449 /* We only handle the SVE single-bit immediates here. */
7450 if (real_equal (&r, &dconst0))
7451 asm_fprintf (f, "0.0");
7452 else if (real_equal (&r, &dconst1))
7453 asm_fprintf (f, "1.0");
7454 else if (real_equal (&r, &dconsthalf))
7455 asm_fprintf (f, "0.5");
7456 else
7457 return false;
7458
7459 return true;
7460 }
7461
7462 /* Return the equivalent letter for size. */
7463 static char
7464 sizetochar (int size)
7465 {
7466 switch (size)
7467 {
7468 case 64: return 'd';
7469 case 32: return 's';
7470 case 16: return 'h';
7471 case 8 : return 'b';
7472 default: gcc_unreachable ();
7473 }
7474 }
7475
7476 /* Print operand X to file F in a target specific manner according to CODE.
7477 The acceptable formatting commands given by CODE are:
7478 'c': An integer or symbol address without a preceding #
7479 sign.
7480 'C': Take the duplicated element in a vector constant
7481 and print it in hex.
7482 'D': Take the duplicated element in a vector constant
7483 and print it as an unsigned integer, in decimal.
7484 'e': Print the sign/zero-extend size as a character 8->b,
7485 16->h, 32->w.
7486 'p': Prints N such that 2^N == X (X must be power of 2 and
7487 const int).
7488 'P': Print the number of non-zero bits in X (a const_int).
7489 'H': Print the higher numbered register of a pair (TImode)
7490 of regs.
7491 'm': Print a condition (eq, ne, etc).
7492 'M': Same as 'm', but invert condition.
7493 'N': Take the duplicated element in a vector constant
7494 and print the negative of it in decimal.
7495 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7496 'S/T/U/V': Print a FP/SIMD register name for a register list.
7497 The register printed is the FP/SIMD register name
7498 of X + 0/1/2/3 for S/T/U/V.
7499 'R': Print a scalar FP/SIMD register name + 1.
7500 'X': Print bottom 16 bits of integer constant in hex.
7501 'w/x': Print a general register name or the zero register
7502 (32-bit or 64-bit).
7503    '0':		Print a normal operand; if it's a general register,
7504 then we assume DImode.
7505 'k': Print NZCV for conditional compare instructions.
7506 'A': Output address constant representing the first
7507 argument of X, specifying a relocation offset
7508 if appropriate.
7509 'L': Output constant address specified by X
7510 with a relocation offset if appropriate.
7511 'G': Prints address of X, specifying a PC relative
7512 relocation mode if appropriate.
7513 'y': Output address of LDP or STP - this is used for
7514 some LDP/STPs which don't use a PARALLEL in their
7515 pattern (so the mode needs to be adjusted).
7516 'z': Output address of a typical LDP or STP. */
7517
7518 static void
7519 aarch64_print_operand (FILE *f, rtx x, int code)
7520 {
7521 rtx elt;
7522 switch (code)
7523 {
7524 case 'c':
7525 switch (GET_CODE (x))
7526 {
7527 case CONST_INT:
7528 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7529 break;
7530
7531 case SYMBOL_REF:
7532 output_addr_const (f, x);
7533 break;
7534
7535 case CONST:
7536 if (GET_CODE (XEXP (x, 0)) == PLUS
7537 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7538 {
7539 output_addr_const (f, x);
7540 break;
7541 }
7542 /* Fall through. */
7543
7544 default:
7545 output_operand_lossage ("unsupported operand for code '%c'", code);
7546 }
7547 break;
7548
7549 case 'e':
7550 {
7551 int n;
7552
7553 if (!CONST_INT_P (x)
7554 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7555 {
7556 output_operand_lossage ("invalid operand for '%%%c'", code);
7557 return;
7558 }
7559
7560 switch (n)
7561 {
7562 case 3:
7563 fputc ('b', f);
7564 break;
7565 case 4:
7566 fputc ('h', f);
7567 break;
7568 case 5:
7569 fputc ('w', f);
7570 break;
7571 default:
7572 output_operand_lossage ("invalid operand for '%%%c'", code);
7573 return;
7574 }
7575 }
7576 break;
7577
7578 case 'p':
7579 {
7580 int n;
7581
7582 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7583 {
7584 output_operand_lossage ("invalid operand for '%%%c'", code);
7585 return;
7586 }
7587
7588 asm_fprintf (f, "%d", n);
7589 }
7590 break;
7591
7592 case 'P':
7593 if (!CONST_INT_P (x))
7594 {
7595 output_operand_lossage ("invalid operand for '%%%c'", code);
7596 return;
7597 }
7598
7599 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7600 break;
7601
7602 case 'H':
7603 if (x == const0_rtx)
7604 {
7605 asm_fprintf (f, "xzr");
7606 break;
7607 }
7608
7609 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7610 {
7611 output_operand_lossage ("invalid operand for '%%%c'", code);
7612 return;
7613 }
7614
7615 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7616 break;
7617
7618 case 'M':
7619 case 'm':
7620 {
7621 int cond_code;
7622 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7623 if (x == const_true_rtx)
7624 {
7625 if (code == 'M')
7626 fputs ("nv", f);
7627 return;
7628 }
7629
7630 if (!COMPARISON_P (x))
7631 {
7632 output_operand_lossage ("invalid operand for '%%%c'", code);
7633 return;
7634 }
7635
7636 cond_code = aarch64_get_condition_code (x);
7637 gcc_assert (cond_code >= 0);
7638 if (code == 'M')
7639 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7640 fputs (aarch64_condition_codes[cond_code], f);
7641 }
7642 break;
7643
7644 case 'N':
7645 if (!const_vec_duplicate_p (x, &elt))
7646 {
7647 output_operand_lossage ("invalid vector constant");
7648 return;
7649 }
7650
7651 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7652 asm_fprintf (f, "%wd", -INTVAL (elt));
7653 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7654 && aarch64_print_vector_float_operand (f, x, true))
7655 ;
7656 else
7657 {
7658 output_operand_lossage ("invalid vector constant");
7659 return;
7660 }
7661 break;
7662
7663 case 'b':
7664 case 'h':
7665 case 's':
7666 case 'd':
7667 case 'q':
7668 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7669 {
7670 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7671 return;
7672 }
7673 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7674 break;
7675
7676 case 'S':
7677 case 'T':
7678 case 'U':
7679 case 'V':
7680 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7681 {
7682 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7683 return;
7684 }
7685 asm_fprintf (f, "%c%d",
7686 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7687 REGNO (x) - V0_REGNUM + (code - 'S'));
7688 break;
7689
7690 case 'R':
7691 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7692 {
7693 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7694 return;
7695 }
7696 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7697 break;
7698
7699 case 'X':
7700 if (!CONST_INT_P (x))
7701 {
7702 output_operand_lossage ("invalid operand for '%%%c'", code);
7703 return;
7704 }
7705 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7706 break;
7707
7708 case 'C':
7709 {
7710 /* Print a replicated constant in hex. */
7711 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7712 {
7713 output_operand_lossage ("invalid operand for '%%%c'", code);
7714 return;
7715 }
7716 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7717 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7718 }
7719 break;
7720
7721 case 'D':
7722 {
7723 /* Print a replicated constant in decimal, treating it as
7724 unsigned. */
7725 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7726 {
7727 output_operand_lossage ("invalid operand for '%%%c'", code);
7728 return;
7729 }
7730 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7731 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7732 }
7733 break;
7734
7735 case 'w':
7736 case 'x':
7737 if (x == const0_rtx
7738 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7739 {
7740 asm_fprintf (f, "%czr", code);
7741 break;
7742 }
7743
7744 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7745 {
7746 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7747 break;
7748 }
7749
7750 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7751 {
7752 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7753 break;
7754 }
7755
7756 /* Fall through */
7757
7758 case 0:
7759 if (x == NULL)
7760 {
7761 output_operand_lossage ("missing operand");
7762 return;
7763 }
7764
7765 switch (GET_CODE (x))
7766 {
7767 case REG:
7768 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7769 {
7770 if (REG_NREGS (x) == 1)
7771 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7772 else
7773 {
7774 char suffix
7775 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7776 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7777 REGNO (x) - V0_REGNUM, suffix,
7778 END_REGNO (x) - V0_REGNUM - 1, suffix);
7779 }
7780 }
7781 else
7782 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7783 break;
7784
7785 case MEM:
7786 output_address (GET_MODE (x), XEXP (x, 0));
7787 break;
7788
7789 case LABEL_REF:
7790 case SYMBOL_REF:
7791 output_addr_const (asm_out_file, x);
7792 break;
7793
7794 case CONST_INT:
7795 asm_fprintf (f, "%wd", INTVAL (x));
7796 break;
7797
7798 case CONST:
7799 if (!VECTOR_MODE_P (GET_MODE (x)))
7800 {
7801 output_addr_const (asm_out_file, x);
7802 break;
7803 }
7804 /* fall through */
7805
7806 case CONST_VECTOR:
7807 if (!const_vec_duplicate_p (x, &elt))
7808 {
7809 output_operand_lossage ("invalid vector constant");
7810 return;
7811 }
7812
7813 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7814 asm_fprintf (f, "%wd", INTVAL (elt));
7815 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7816 && aarch64_print_vector_float_operand (f, x, false))
7817 ;
7818 else
7819 {
7820 output_operand_lossage ("invalid vector constant");
7821 return;
7822 }
7823 break;
7824
7825 case CONST_DOUBLE:
7826 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7827 be getting CONST_DOUBLEs holding integers. */
7828 gcc_assert (GET_MODE (x) != VOIDmode);
7829 if (aarch64_float_const_zero_rtx_p (x))
7830 {
7831 fputc ('0', f);
7832 break;
7833 }
7834 else if (aarch64_float_const_representable_p (x))
7835 {
7836 #define buf_size 20
7837 char float_buf[buf_size] = {'\0'};
7838 real_to_decimal_for_mode (float_buf,
7839 CONST_DOUBLE_REAL_VALUE (x),
7840 buf_size, buf_size,
7841 1, GET_MODE (x));
7842 asm_fprintf (asm_out_file, "%s", float_buf);
7843 break;
7844 #undef buf_size
7845 }
7846 output_operand_lossage ("invalid constant");
7847 return;
7848 default:
7849 output_operand_lossage ("invalid operand");
7850 return;
7851 }
7852 break;
7853
7854 case 'A':
7855 if (GET_CODE (x) == HIGH)
7856 x = XEXP (x, 0);
7857
7858 switch (aarch64_classify_symbolic_expression (x))
7859 {
7860 case SYMBOL_SMALL_GOT_4G:
7861 asm_fprintf (asm_out_file, ":got:");
7862 break;
7863
7864 case SYMBOL_SMALL_TLSGD:
7865 asm_fprintf (asm_out_file, ":tlsgd:");
7866 break;
7867
7868 case SYMBOL_SMALL_TLSDESC:
7869 asm_fprintf (asm_out_file, ":tlsdesc:");
7870 break;
7871
7872 case SYMBOL_SMALL_TLSIE:
7873 asm_fprintf (asm_out_file, ":gottprel:");
7874 break;
7875
7876 case SYMBOL_TLSLE24:
7877 asm_fprintf (asm_out_file, ":tprel:");
7878 break;
7879
7880 case SYMBOL_TINY_GOT:
7881 gcc_unreachable ();
7882 break;
7883
7884 default:
7885 break;
7886 }
7887 output_addr_const (asm_out_file, x);
7888 break;
7889
7890 case 'L':
7891 switch (aarch64_classify_symbolic_expression (x))
7892 {
7893 case SYMBOL_SMALL_GOT_4G:
7894 asm_fprintf (asm_out_file, ":lo12:");
7895 break;
7896
7897 case SYMBOL_SMALL_TLSGD:
7898 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7899 break;
7900
7901 case SYMBOL_SMALL_TLSDESC:
7902 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7903 break;
7904
7905 case SYMBOL_SMALL_TLSIE:
7906 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7907 break;
7908
7909 case SYMBOL_TLSLE12:
7910 asm_fprintf (asm_out_file, ":tprel_lo12:");
7911 break;
7912
7913 case SYMBOL_TLSLE24:
7914 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7915 break;
7916
7917 case SYMBOL_TINY_GOT:
7918 asm_fprintf (asm_out_file, ":got:");
7919 break;
7920
7921 case SYMBOL_TINY_TLSIE:
7922 asm_fprintf (asm_out_file, ":gottprel:");
7923 break;
7924
7925 default:
7926 break;
7927 }
7928 output_addr_const (asm_out_file, x);
7929 break;
7930
7931 case 'G':
7932 switch (aarch64_classify_symbolic_expression (x))
7933 {
7934 case SYMBOL_TLSLE24:
7935 asm_fprintf (asm_out_file, ":tprel_hi12:");
7936 break;
7937 default:
7938 break;
7939 }
7940 output_addr_const (asm_out_file, x);
7941 break;
7942
7943 case 'k':
7944 {
7945 HOST_WIDE_INT cond_code;
7946
7947 if (!CONST_INT_P (x))
7948 {
7949 output_operand_lossage ("invalid operand for '%%%c'", code);
7950 return;
7951 }
7952
7953 cond_code = INTVAL (x);
7954 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7955 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7956 }
7957 break;
7958
7959 case 'y':
7960 case 'z':
7961 {
7962 machine_mode mode = GET_MODE (x);
7963
7964 if (GET_CODE (x) != MEM
7965 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7966 {
7967 output_operand_lossage ("invalid operand for '%%%c'", code);
7968 return;
7969 }
7970
7971 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7972 code == 'y'
7973 ? ADDR_QUERY_LDP_STP_N
7974 : ADDR_QUERY_LDP_STP))
7975 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7976 }
7977 break;
7978
7979 default:
7980 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7981 return;
7982 }
7983 }
7984
7985 /* Print address 'x' of a memory access with mode 'mode'.
7986    'type' is the context required by aarch64_classify_address: for example,
7987    ADDR_QUERY_LDP_STP for an LDP/STP operand, or ADDR_QUERY_ANY otherwise.  */
7988 static bool
7989 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7990 aarch64_addr_query_type type)
7991 {
7992 struct aarch64_address_info addr;
7993 unsigned int size;
7994
7995 /* Check all addresses are Pmode - including ILP32. */
7996 if (GET_MODE (x) != Pmode
7997 && (!CONST_INT_P (x)
7998 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7999 {
8000 output_operand_lossage ("invalid address mode");
8001 return false;
8002 }
8003
8004 if (aarch64_classify_address (&addr, x, mode, true, type))
8005 switch (addr.type)
8006 {
8007 case ADDRESS_REG_IMM:
8008 if (known_eq (addr.const_offset, 0))
8009 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8010 else if (aarch64_sve_data_mode_p (mode))
8011 {
8012 HOST_WIDE_INT vnum
8013 = exact_div (addr.const_offset,
8014 BYTES_PER_SVE_VECTOR).to_constant ();
8015 asm_fprintf (f, "[%s, #%wd, mul vl]",
8016 reg_names[REGNO (addr.base)], vnum);
8017 }
8018 else if (aarch64_sve_pred_mode_p (mode))
8019 {
8020 HOST_WIDE_INT vnum
8021 = exact_div (addr.const_offset,
8022 BYTES_PER_SVE_PRED).to_constant ();
8023 asm_fprintf (f, "[%s, #%wd, mul vl]",
8024 reg_names[REGNO (addr.base)], vnum);
8025 }
8026 else
8027 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8028 INTVAL (addr.offset));
8029 return true;
8030
8031 case ADDRESS_REG_REG:
8032 if (addr.shift == 0)
8033 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8034 reg_names [REGNO (addr.offset)]);
8035 else
8036 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8037 reg_names [REGNO (addr.offset)], addr.shift);
8038 return true;
8039
8040 case ADDRESS_REG_UXTW:
8041 if (addr.shift == 0)
8042 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8043 REGNO (addr.offset) - R0_REGNUM);
8044 else
8045 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8046 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8047 return true;
8048
8049 case ADDRESS_REG_SXTW:
8050 if (addr.shift == 0)
8051 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8052 REGNO (addr.offset) - R0_REGNUM);
8053 else
8054 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8055 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8056 return true;
8057
8058 case ADDRESS_REG_WB:
8059 /* Writeback is only supported for fixed-width modes. */
8060 size = GET_MODE_SIZE (mode).to_constant ();
8061 switch (GET_CODE (x))
8062 {
8063 case PRE_INC:
8064 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8065 return true;
8066 case POST_INC:
8067 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8068 return true;
8069 case PRE_DEC:
8070 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8071 return true;
8072 case POST_DEC:
8073 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8074 return true;
8075 case PRE_MODIFY:
8076 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8077 INTVAL (addr.offset));
8078 return true;
8079 case POST_MODIFY:
8080 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8081 INTVAL (addr.offset));
8082 return true;
8083 default:
8084 break;
8085 }
8086 break;
8087
8088 case ADDRESS_LO_SUM:
8089 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8090 output_addr_const (f, addr.offset);
8091 asm_fprintf (f, "]");
8092 return true;
8093
8094 case ADDRESS_SYMBOLIC:
8095 output_addr_const (f, x);
8096 return true;
8097 }
8098
8099 return false;
8100 }
8101
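/* Editor's note: a standalone illustration (not GCC code) of the
   "mul vl" form printed above.  The offset of an SVE vector access is
   expressed as a multiple of the vector length; assuming, purely for
   the example, a 256-bit (32-byte) vector length and the base register
   x0, a byte offset of 64 would print as "[x0, #2, mul vl]".  */
#include <stdio.h>

int
main (void)
{
  int bytes_per_vector = 32;	/* Hypothetical VL; the real VL is not fixed.  */
  int const_offset = 64;
  int vnum = const_offset / bytes_per_vector;
  printf ("[x0, #%d, mul vl]\n", vnum);
  return 0;
}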
8102 /* Print address 'x' of a memory access with mode 'mode'. */
8103 static void
8104 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8105 {
8106 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8107 output_addr_const (f, x);
8108 }
8109
8110 bool
8111 aarch64_label_mentioned_p (rtx x)
8112 {
8113 const char *fmt;
8114 int i;
8115
8116 if (GET_CODE (x) == LABEL_REF)
8117 return true;
8118
8119 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8120 referencing instruction, but they are constant offsets, not
8121 symbols. */
8122 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8123 return false;
8124
8125 fmt = GET_RTX_FORMAT (GET_CODE (x));
8126 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8127 {
8128 if (fmt[i] == 'E')
8129 {
8130 int j;
8131
8132 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8133 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8134 return 1;
8135 }
8136 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8137 return 1;
8138 }
8139
8140 return 0;
8141 }
8142
8143 /* Implement REGNO_REG_CLASS. */
8144
8145 enum reg_class
8146 aarch64_regno_regclass (unsigned regno)
8147 {
8148 if (GP_REGNUM_P (regno))
8149 return GENERAL_REGS;
8150
8151 if (regno == SP_REGNUM)
8152 return STACK_REG;
8153
8154 if (regno == FRAME_POINTER_REGNUM
8155 || regno == ARG_POINTER_REGNUM)
8156 return POINTER_REGS;
8157
8158 if (FP_REGNUM_P (regno))
8159 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8160
8161 if (PR_REGNUM_P (regno))
8162 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8163
8164 return NO_REGS;
8165 }
8166
8167 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8168 If OFFSET is out of range, return an offset of an anchor point
8169 that is in range. Return 0 otherwise. */
8170
8171 static HOST_WIDE_INT
8172 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8173 machine_mode mode)
8174 {
8175 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8176 if (size > 16)
8177 return (offset + 0x400) & ~0x7f0;
8178
8179 /* For offsets that aren't a multiple of the access size, the limit is
8180 -256...255. */
8181 if (offset & (size - 1))
8182 {
8183 /* BLKmode typically uses LDP of X-registers. */
8184 if (mode == BLKmode)
8185 return (offset + 512) & ~0x3ff;
8186 return (offset + 0x100) & ~0x1ff;
8187 }
8188
8189 /* Small negative offsets are supported. */
8190 if (IN_RANGE (offset, -256, 0))
8191 return 0;
8192
8193 if (mode == TImode || mode == TFmode)
8194 return (offset + 0x100) & ~0x1ff;
8195
8196   /* Use the unsigned 12-bit offset, scaled by the access size.  */
8197 return offset & (~0xfff * size);
8198 }
8199
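/* Editor's note: a standalone example (not GCC code) of the anchor
   selection above for an aligned 4-byte access.  The anchor keeps the
   residual inside the unsigned 12-bit immediate scaled by the access
   size, so nearby word accesses can share one anchor register.  */
#include <stdio.h>

int
main (void)
{
  long long offset = 0x10008;		/* Aligned 4-byte access.  */
  long long size = 4;
  long long anchor = offset & (~0xfffLL * size);   /* As in the code above.  */
  printf ("offset %#llx -> anchor %#llx + %lld\n",
	  (unsigned long long) offset, (unsigned long long) anchor,
	  offset - anchor);
  return 0;
}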
8200 static rtx
8201 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8202 {
8203 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8204 where mask is selected by alignment and size of the offset.
8205 We try to pick as large a range for the offset as possible to
8206 maximize the chance of a CSE. However, for aligned addresses
8207 we limit the range to 4k so that structures with different sized
8208 elements are likely to use the same base. We need to be careful
8209 not to split a CONST for some forms of address expression, otherwise
8210 it will generate sub-optimal code. */
8211
8212 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8213 {
8214 rtx base = XEXP (x, 0);
8215 rtx offset_rtx = XEXP (x, 1);
8216 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8217
8218 if (GET_CODE (base) == PLUS)
8219 {
8220 rtx op0 = XEXP (base, 0);
8221 rtx op1 = XEXP (base, 1);
8222
8223 /* Force any scaling into a temp for CSE. */
8224 op0 = force_reg (Pmode, op0);
8225 op1 = force_reg (Pmode, op1);
8226
8227 /* Let the pointer register be in op0. */
8228 if (REG_POINTER (op1))
8229 std::swap (op0, op1);
8230
8231 /* If the pointer is virtual or frame related, then we know that
8232 virtual register instantiation or register elimination is going
8233 to apply a second constant. We want the two constants folded
8234 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8235 if (virt_or_elim_regno_p (REGNO (op0)))
8236 {
8237 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8238 NULL_RTX, true, OPTAB_DIRECT);
8239 return gen_rtx_PLUS (Pmode, base, op1);
8240 }
8241
8242 /* Otherwise, in order to encourage CSE (and thence loop strength
8243 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8244 base = expand_binop (Pmode, add_optab, op0, op1,
8245 NULL_RTX, true, OPTAB_DIRECT);
8246 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8247 }
8248
8249 HOST_WIDE_INT size;
8250 if (GET_MODE_SIZE (mode).is_constant (&size))
8251 {
8252 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8253 mode);
8254 if (base_offset != 0)
8255 {
8256 base = plus_constant (Pmode, base, base_offset);
8257 base = force_operand (base, NULL_RTX);
8258 return plus_constant (Pmode, base, offset - base_offset);
8259 }
8260 }
8261 }
8262
8263 return x;
8264 }
8265
8266 static reg_class_t
8267 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8268 reg_class_t rclass,
8269 machine_mode mode,
8270 secondary_reload_info *sri)
8271 {
8272 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8273 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8274 comment at the head of aarch64-sve.md for more details about the
8275 big-endian handling. */
8276 if (BYTES_BIG_ENDIAN
8277 && reg_class_subset_p (rclass, FP_REGS)
8278 && !((REG_P (x) && HARD_REGISTER_P (x))
8279 || aarch64_simd_valid_immediate (x, NULL))
8280 && aarch64_sve_data_mode_p (mode))
8281 {
8282 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8283 return NO_REGS;
8284 }
8285
8286 /* If we have to disable direct literal pool loads and stores because the
8287 function is too big, then we need a scratch register. */
8288 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8289 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8290 || targetm.vector_mode_supported_p (GET_MODE (x)))
8291 && !aarch64_pcrelative_literal_loads)
8292 {
8293 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8294 return NO_REGS;
8295 }
8296
8297 /* Without the TARGET_SIMD instructions we cannot move a Q register
8298 to a Q register directly. We need a scratch. */
8299 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8300 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8301 && reg_class_subset_p (rclass, FP_REGS))
8302 {
8303 sri->icode = code_for_aarch64_reload_mov (mode);
8304 return NO_REGS;
8305 }
8306
8307   /* A TFmode or TImode memory access should be handled via an FP register
8308 because AArch64 has richer addressing modes for LDR/STR instructions
8309 than LDP/STP instructions. */
8310 if (TARGET_FLOAT && rclass == GENERAL_REGS
8311 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8312 return FP_REGS;
8313
8314 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8315 return GENERAL_REGS;
8316
8317 return NO_REGS;
8318 }
8319
8320 static bool
8321 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8322 {
8323 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8324
8325 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8326 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8327 if (frame_pointer_needed)
8328 return to == HARD_FRAME_POINTER_REGNUM;
8329 return true;
8330 }
8331
8332 poly_int64
8333 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8334 {
8335 if (to == HARD_FRAME_POINTER_REGNUM)
8336 {
8337 if (from == ARG_POINTER_REGNUM)
8338 return cfun->machine->frame.hard_fp_offset;
8339
8340 if (from == FRAME_POINTER_REGNUM)
8341 return cfun->machine->frame.hard_fp_offset
8342 - cfun->machine->frame.locals_offset;
8343 }
8344
8345 if (to == STACK_POINTER_REGNUM)
8346 {
8347 if (from == FRAME_POINTER_REGNUM)
8348 return cfun->machine->frame.frame_size
8349 - cfun->machine->frame.locals_offset;
8350 }
8351
8352 return cfun->machine->frame.frame_size;
8353 }
8354
8355 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8356 previous frame. */
8357
8358 rtx
8359 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8360 {
8361 if (count != 0)
8362 return const0_rtx;
8363 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8364 }
8365
8366
8367 static void
8368 aarch64_asm_trampoline_template (FILE *f)
8369 {
8370 int offset1 = 16;
8371 int offset2 = 20;
8372
8373 if (aarch64_bti_enabled ())
8374 {
8375 asm_fprintf (f, "\thint\t34 // bti c\n");
8376 offset1 -= 4;
8377 offset2 -= 4;
8378 }
8379
8380 if (TARGET_ILP32)
8381 {
8382 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8383 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8384 offset1);
8385 }
8386 else
8387 {
8388 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8389 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8390 offset2);
8391 }
8392 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8393
8394   /* The trampoline needs an extra padding instruction.  If BTI is
8395      enabled, the padding instruction is replaced by the BTI instruction at
8396 the beginning. */
8397 if (!aarch64_bti_enabled ())
8398 assemble_aligned_integer (4, const0_rtx);
8399
8400 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8401 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8402 }
8403
8404 static void
8405 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8406 {
8407 rtx fnaddr, mem, a_tramp;
8408 const int tramp_code_sz = 16;
8409
8410   /* We don't need to copy the trailing D-words; we fill those in below.  */
8411 emit_block_move (m_tramp, assemble_trampoline_template (),
8412 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8413 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8414 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8415 if (GET_MODE (fnaddr) != ptr_mode)
8416 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8417 emit_move_insn (mem, fnaddr);
8418
8419 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8420 emit_move_insn (mem, chain_value);
8421
8422 /* XXX We should really define a "clear_cache" pattern and use
8423 gen_clear_cache(). */
8424 a_tramp = XEXP (m_tramp, 0);
8425 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8426 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8427 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8428 ptr_mode);
8429 }
8430
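/* Editor's note: a hedged sketch (not a GCC definition) of the
   trampoline layout implied by the two functions above for LP64:
   16 bytes of code (two PC-relative loads, a branch and a padding or
   BTI word) followed by the target function address and the static
   chain value that aarch64_trampoline_init writes.  */
#include <stdint.h>
#include <stdio.h>

struct aarch64_trampoline_sketch	/* Hypothetical name.  */
{
  uint32_t code[4];	/* ldr IP1, .+16; ldr chain reg, .+20; br IP1; pad.  */
  uint64_t func_addr;	/* Loaded by the first ldr.  */
  uint64_t static_chain;	/* Loaded by the second ldr.  */
};

int
main (void)
{
  printf ("%zu bytes\n", sizeof (struct aarch64_trampoline_sketch));  /* 32.  */
  return 0;
}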
8431 static unsigned char
8432 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8433 {
8434 /* ??? Logically we should only need to provide a value when
8435 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8436 can hold MODE, but at the moment we need to handle all modes.
8437 Just ignore any runtime parts for registers that can't store them. */
8438 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8439 unsigned int nregs;
8440 switch (regclass)
8441 {
8442 case TAILCALL_ADDR_REGS:
8443 case POINTER_REGS:
8444 case GENERAL_REGS:
8445 case ALL_REGS:
8446 case POINTER_AND_FP_REGS:
8447 case FP_REGS:
8448 case FP_LO_REGS:
8449 if (aarch64_sve_data_mode_p (mode)
8450 && constant_multiple_p (GET_MODE_SIZE (mode),
8451 BYTES_PER_SVE_VECTOR, &nregs))
8452 return nregs;
8453 return (aarch64_vector_data_mode_p (mode)
8454 ? CEIL (lowest_size, UNITS_PER_VREG)
8455 : CEIL (lowest_size, UNITS_PER_WORD));
8456 case STACK_REG:
8457 case PR_REGS:
8458 case PR_LO_REGS:
8459 case PR_HI_REGS:
8460 return 1;
8461
8462 case NO_REGS:
8463 return 0;
8464
8465 default:
8466 break;
8467 }
8468 gcc_unreachable ();
8469 }
8470
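/* Editor's note: a standalone illustration (not GCC code) of the
   register-count arithmetic above for fixed-width modes: the mode size
   is divided, rounding up, by the 16-byte Advanced SIMD register size
   for vector modes or by the 8-byte word size otherwise.  */
#include <stdio.h>

#define CEIL(a, b) (((a) + (b) - 1) / (b))

int
main (void)
{
  printf ("TImode in GENERAL_REGS: %d regs\n", CEIL (16, 8));		/* 2.  */
  printf ("OImode (2 vectors) in FP_REGS: %d regs\n", CEIL (32, 16));	/* 2.  */
  return 0;
}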
8471 static reg_class_t
8472 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8473 {
8474 if (regclass == POINTER_REGS)
8475 return GENERAL_REGS;
8476
8477 if (regclass == STACK_REG)
8478 {
8479 if (REG_P(x)
8480 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8481 return regclass;
8482
8483 return NO_REGS;
8484 }
8485
8486   /* Register elimination can result in a request for
8487      SP+constant->FP_REGS.  We cannot support such operations, which
8488      use SP as the source and an FP_REG as the destination, so reject
8489      them right away.  */
8490 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8491 {
8492 rtx lhs = XEXP (x, 0);
8493
8494 /* Look through a possible SUBREG introduced by ILP32. */
8495 if (GET_CODE (lhs) == SUBREG)
8496 lhs = SUBREG_REG (lhs);
8497
8498 gcc_assert (REG_P (lhs));
8499 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8500 POINTER_REGS));
8501 return NO_REGS;
8502 }
8503
8504 return regclass;
8505 }
8506
8507 void
8508 aarch64_asm_output_labelref (FILE* f, const char *name)
8509 {
8510 asm_fprintf (f, "%U%s", name);
8511 }
8512
8513 static void
8514 aarch64_elf_asm_constructor (rtx symbol, int priority)
8515 {
8516 if (priority == DEFAULT_INIT_PRIORITY)
8517 default_ctor_section_asm_out_constructor (symbol, priority);
8518 else
8519 {
8520 section *s;
8521       /* While priority is known to be in the range [0, 65535], so 18 bytes
8522	  would be enough, the compiler might not know that.  To avoid a
8523	  -Wformat-truncation false positive, use a larger size.  */
8524 char buf[23];
8525 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8526 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8527 switch_to_section (s);
8528 assemble_align (POINTER_SIZE);
8529 assemble_aligned_integer (POINTER_BYTES, symbol);
8530 }
8531 }
8532
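/* Editor's note: a standalone demonstration (not GCC code) of the
   section name constructed above: a constructor with priority 1000 is
   placed in ".init_array.01000", and the linker script orders these
   sections by priority.  */
#include <stdio.h>

int
main (void)
{
  char buf[23];
  snprintf (buf, sizeof (buf), ".init_array.%.5u", 1000);
  puts (buf);	/* Prints ".init_array.01000".  */
  return 0;
}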
8533 static void
8534 aarch64_elf_asm_destructor (rtx symbol, int priority)
8535 {
8536 if (priority == DEFAULT_INIT_PRIORITY)
8537 default_dtor_section_asm_out_destructor (symbol, priority);
8538 else
8539 {
8540 section *s;
8541       /* While priority is known to be in the range [0, 65535], so 18 bytes
8542	  would be enough, the compiler might not know that.  To avoid a
8543	  -Wformat-truncation false positive, use a larger size.  */
8544 char buf[23];
8545 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8546 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8547 switch_to_section (s);
8548 assemble_align (POINTER_SIZE);
8549 assemble_aligned_integer (POINTER_BYTES, symbol);
8550 }
8551 }
8552
8553 const char*
8554 aarch64_output_casesi (rtx *operands)
8555 {
8556 char buf[100];
8557 char label[100];
8558 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8559 int index;
8560 static const char *const patterns[4][2] =
8561 {
8562 {
8563 "ldrb\t%w3, [%0,%w1,uxtw]",
8564 "add\t%3, %4, %w3, sxtb #2"
8565 },
8566 {
8567 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8568 "add\t%3, %4, %w3, sxth #2"
8569 },
8570 {
8571 "ldr\t%w3, [%0,%w1,uxtw #2]",
8572 "add\t%3, %4, %w3, sxtw #2"
8573 },
8574 /* We assume that DImode is only generated when not optimizing and
8575 that we don't really need 64-bit address offsets. That would
8576 imply an object file with 8GB of code in a single function! */
8577 {
8578 "ldr\t%w3, [%0,%w1,uxtw #2]",
8579 "add\t%3, %4, %w3, sxtw #2"
8580 }
8581 };
8582
8583 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8584
8585 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8586 index = exact_log2 (GET_MODE_SIZE (mode));
8587
8588 gcc_assert (index >= 0 && index <= 3);
8589
8590   /* Need to implement table size reduction, by changing the code below.  */
8591 output_asm_insn (patterns[index][0], operands);
8592 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8593 snprintf (buf, sizeof (buf),
8594 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8595 output_asm_insn (buf, operands);
8596 output_asm_insn (patterns[index][1], operands);
8597 output_asm_insn ("br\t%3", operands);
8598 assemble_label (asm_out_file, label);
8599 return "";
8600 }
8601
8602
8603 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8604 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8605 operator. */
8606
8607 int
8608 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8609 {
8610 if (shift >= 0 && shift <= 3)
8611 {
8612 int size;
8613 for (size = 8; size <= 32; size *= 2)
8614 {
8615 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8616 if (mask == bits << shift)
8617 return size;
8618 }
8619 }
8620 return 0;
8621 }
8622
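/* Editor's note: a standalone example (not GCC code) of the mask shapes
   that aarch64_uxt_size above recognises: SIZE set bits shifted left by
   the scaling amount, e.g. 0xff << 1 for an operand extended with UXTB
   and scaled by 2.  */
#include <stdio.h>

int
main (void)
{
  int shift = 1;
  unsigned long long mask = 0x1fe;	/* 0xff << 1.  */
  for (int size = 8; size <= 32; size *= 2)
    if (mask == ((1ULL << size) - 1) << shift)
      printf ("mask %#llx is a %d-bit extend scaled by %d\n",
	      mask, size, 1 << shift);
  return 0;
}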
8623 /* Constant pools are per-function only when PC-relative literal
8624    loads are enabled or we are using the large memory
8625    model.  */
8626
8627 static inline bool
8628 aarch64_can_use_per_function_literal_pools_p (void)
8629 {
8630 return (aarch64_pcrelative_literal_loads
8631 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8632 }
8633
8634 static bool
8635 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8636 {
8637 /* We can't use blocks for constants when we're using a per-function
8638 constant pool. */
8639 return !aarch64_can_use_per_function_literal_pools_p ();
8640 }
8641
8642 /* Select appropriate section for constants depending
8643 on where we place literal pools. */
8644
8645 static section *
8646 aarch64_select_rtx_section (machine_mode mode,
8647 rtx x,
8648 unsigned HOST_WIDE_INT align)
8649 {
8650 if (aarch64_can_use_per_function_literal_pools_p ())
8651 return function_section (current_function_decl);
8652
8653 return default_elf_select_rtx_section (mode, x, align);
8654 }
8655
8656 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8657 void
8658 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8659 HOST_WIDE_INT offset)
8660 {
8661 /* When using per-function literal pools, we must ensure that any code
8662 section is aligned to the minimal instruction length, lest we get
8663 errors from the assembler re "unaligned instructions". */
8664 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8665 ASM_OUTPUT_ALIGN (f, 2);
8666 }
8667
8668 /* Costs. */
8669
8670 /* Helper function for rtx cost calculation. Strip a shift expression
8671 from X. Returns the inner operand if successful, or the original
8672 expression on failure. */
8673 static rtx
8674 aarch64_strip_shift (rtx x)
8675 {
8676 rtx op = x;
8677
8678 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8679 we can convert both to ROR during final output. */
8680 if ((GET_CODE (op) == ASHIFT
8681 || GET_CODE (op) == ASHIFTRT
8682 || GET_CODE (op) == LSHIFTRT
8683 || GET_CODE (op) == ROTATERT
8684 || GET_CODE (op) == ROTATE)
8685 && CONST_INT_P (XEXP (op, 1)))
8686 return XEXP (op, 0);
8687
8688 if (GET_CODE (op) == MULT
8689 && CONST_INT_P (XEXP (op, 1))
8690 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8691 return XEXP (op, 0);
8692
8693 return x;
8694 }
8695
8696 /* Helper function for rtx cost calculation. Strip an extend
8697 expression from X. Returns the inner operand if successful, or the
8698 original expression on failure. We deal with a number of possible
8699 canonicalization variations here. If STRIP_SHIFT is true, then
8700 we can strip off a shift also. */
8701 static rtx
8702 aarch64_strip_extend (rtx x, bool strip_shift)
8703 {
8704 scalar_int_mode mode;
8705 rtx op = x;
8706
8707 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8708 return op;
8709
8710 /* Zero and sign extraction of a widened value. */
8711 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8712 && XEXP (op, 2) == const0_rtx
8713 && GET_CODE (XEXP (op, 0)) == MULT
8714 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8715 XEXP (op, 1)))
8716 return XEXP (XEXP (op, 0), 0);
8717
8718 /* It can also be represented (for zero-extend) as an AND with an
8719 immediate. */
8720 if (GET_CODE (op) == AND
8721 && GET_CODE (XEXP (op, 0)) == MULT
8722 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8723 && CONST_INT_P (XEXP (op, 1))
8724 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8725 INTVAL (XEXP (op, 1))) != 0)
8726 return XEXP (XEXP (op, 0), 0);
8727
8728 /* Now handle extended register, as this may also have an optional
8729 left shift by 1..4. */
8730 if (strip_shift
8731 && GET_CODE (op) == ASHIFT
8732 && CONST_INT_P (XEXP (op, 1))
8733 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8734 op = XEXP (op, 0);
8735
8736 if (GET_CODE (op) == ZERO_EXTEND
8737 || GET_CODE (op) == SIGN_EXTEND)
8738 op = XEXP (op, 0);
8739
8740 if (op != x)
8741 return op;
8742
8743 return x;
8744 }
8745
8746 /* Return true iff CODE is a shift supported in combination
8747 with arithmetic instructions. */
8748
8749 static bool
8750 aarch64_shift_p (enum rtx_code code)
8751 {
8752 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8753 }
8754
8755
8756 /* Return true iff X is a cheap shift without a sign extend. */
8757
8758 static bool
8759 aarch64_cheap_mult_shift_p (rtx x)
8760 {
8761 rtx op0, op1;
8762
8763 op0 = XEXP (x, 0);
8764 op1 = XEXP (x, 1);
8765
8766 if (!(aarch64_tune_params.extra_tuning_flags
8767 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8768 return false;
8769
8770 if (GET_CODE (op0) == SIGN_EXTEND)
8771 return false;
8772
8773 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8774 && UINTVAL (op1) <= 4)
8775 return true;
8776
8777 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8778 return false;
8779
8780 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8781
8782 if (l2 > 0 && l2 <= 4)
8783 return true;
8784
8785 return false;
8786 }
8787
8788 /* Helper function for rtx cost calculation. Calculate the cost of
8789 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8790 Return the calculated cost of the expression, recursing manually in to
8791 operands where needed. */
8792
8793 static int
8794 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8795 {
8796 rtx op0, op1;
8797 const struct cpu_cost_table *extra_cost
8798 = aarch64_tune_params.insn_extra_cost;
8799 int cost = 0;
8800 bool compound_p = (outer == PLUS || outer == MINUS);
8801 machine_mode mode = GET_MODE (x);
8802
8803 gcc_checking_assert (code == MULT);
8804
8805 op0 = XEXP (x, 0);
8806 op1 = XEXP (x, 1);
8807
8808 if (VECTOR_MODE_P (mode))
8809 mode = GET_MODE_INNER (mode);
8810
8811 /* Integer multiply/fma. */
8812 if (GET_MODE_CLASS (mode) == MODE_INT)
8813 {
8814 /* The multiply will be canonicalized as a shift; cost it as such. */
8815 if (aarch64_shift_p (GET_CODE (x))
8816 || (CONST_INT_P (op1)
8817 && exact_log2 (INTVAL (op1)) > 0))
8818 {
8819 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8820 || GET_CODE (op0) == SIGN_EXTEND;
8821 if (speed)
8822 {
8823 if (compound_p)
8824 {
8825 /* If the shift is considered cheap,
8826 then don't add any cost. */
8827 if (aarch64_cheap_mult_shift_p (x))
8828 ;
8829 else if (REG_P (op1))
8830 /* ARITH + shift-by-register. */
8831 cost += extra_cost->alu.arith_shift_reg;
8832 else if (is_extend)
8833 /* ARITH + extended register. We don't have a cost field
8834 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8835 cost += extra_cost->alu.extend_arith;
8836 else
8837 /* ARITH + shift-by-immediate. */
8838 cost += extra_cost->alu.arith_shift;
8839 }
8840 else
8841 /* LSL (immediate). */
8842 cost += extra_cost->alu.shift;
8843
8844 }
8845 /* Strip extends as we will have costed them in the case above. */
8846 if (is_extend)
8847 op0 = aarch64_strip_extend (op0, true);
8848
8849 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8850
8851 return cost;
8852 }
8853
8854 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8855 compound and let the below cases handle it. After all, MNEG is a
8856 special-case alias of MSUB. */
8857 if (GET_CODE (op0) == NEG)
8858 {
8859 op0 = XEXP (op0, 0);
8860 compound_p = true;
8861 }
8862
8863 /* Integer multiplies or FMAs have zero/sign extending variants. */
8864 if ((GET_CODE (op0) == ZERO_EXTEND
8865 && GET_CODE (op1) == ZERO_EXTEND)
8866 || (GET_CODE (op0) == SIGN_EXTEND
8867 && GET_CODE (op1) == SIGN_EXTEND))
8868 {
8869 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8870 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8871
8872 if (speed)
8873 {
8874 if (compound_p)
8875 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8876 cost += extra_cost->mult[0].extend_add;
8877 else
8878 /* MUL/SMULL/UMULL. */
8879 cost += extra_cost->mult[0].extend;
8880 }
8881
8882 return cost;
8883 }
8884
8885 /* This is either an integer multiply or a MADD. In both cases
8886 we want to recurse and cost the operands. */
8887 cost += rtx_cost (op0, mode, MULT, 0, speed);
8888 cost += rtx_cost (op1, mode, MULT, 1, speed);
8889
8890 if (speed)
8891 {
8892 if (compound_p)
8893 /* MADD/MSUB. */
8894 cost += extra_cost->mult[mode == DImode].add;
8895 else
8896 /* MUL. */
8897 cost += extra_cost->mult[mode == DImode].simple;
8898 }
8899
8900 return cost;
8901 }
8902 else
8903 {
8904 if (speed)
8905 {
8906 /* Floating-point FMA/FMUL can also support negations of the
8907 operands, unless the rounding mode is upward or downward in
8908 which case FNMUL differs from FMUL with operand negation. */
8909 bool neg0 = GET_CODE (op0) == NEG;
8910 bool neg1 = GET_CODE (op1) == NEG;
8911 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8912 {
8913 if (neg0)
8914 op0 = XEXP (op0, 0);
8915 if (neg1)
8916 op1 = XEXP (op1, 0);
8917 }
8918
8919 if (compound_p)
8920 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8921 cost += extra_cost->fp[mode == DFmode].fma;
8922 else
8923 /* FMUL/FNMUL. */
8924 cost += extra_cost->fp[mode == DFmode].mult;
8925 }
8926
8927 cost += rtx_cost (op0, mode, MULT, 0, speed);
8928 cost += rtx_cost (op1, mode, MULT, 1, speed);
8929 return cost;
8930 }
8931 }
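/* As a worked example of the compound handling above: for
     (plus:DI (mult:DI (reg a) (reg b)) (reg c))
   the PLUS case of aarch64_rtx_costs passes the inner MULT here with
   OUTER == PLUS, so compound_p is true and the whole expression is
   costed as a single MADD (extra_cost->mult[1].add) plus the recursive
   cost of the multiply operands, rather than as a separate MUL and ADD.  */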
8932
8933 static int
8934 aarch64_address_cost (rtx x,
8935 machine_mode mode,
8936 addr_space_t as ATTRIBUTE_UNUSED,
8937 bool speed)
8938 {
8939 enum rtx_code c = GET_CODE (x);
8940 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8941 struct aarch64_address_info info;
8942 int cost = 0;
8943 info.shift = 0;
8944
8945 if (!aarch64_classify_address (&info, x, mode, false))
8946 {
8947 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8948 {
8949 /* This is a CONST or SYMBOL ref which will be split
8950 in a different way depending on the code model in use.
8951 Cost it through the generic infrastructure. */
8952 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8953 /* Divide through by the cost of one instruction to
8954 bring it to the same units as the address costs. */
8955 cost_symbol_ref /= COSTS_N_INSNS (1);
8956 /* The cost is then the cost of preparing the address,
8957 followed by an immediate (possibly 0) offset. */
8958 return cost_symbol_ref + addr_cost->imm_offset;
8959 }
8960 else
8961 {
8962 /* This is most likely a jump table from a case
8963 statement. */
8964 return addr_cost->register_offset;
8965 }
8966 }
8967
8968 switch (info.type)
8969 {
8970 case ADDRESS_LO_SUM:
8971 case ADDRESS_SYMBOLIC:
8972 case ADDRESS_REG_IMM:
8973 cost += addr_cost->imm_offset;
8974 break;
8975
8976 case ADDRESS_REG_WB:
8977 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8978 cost += addr_cost->pre_modify;
8979 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8980 cost += addr_cost->post_modify;
8981 else
8982 gcc_unreachable ();
8983
8984 break;
8985
8986 case ADDRESS_REG_REG:
8987 cost += addr_cost->register_offset;
8988 break;
8989
8990 case ADDRESS_REG_SXTW:
8991 cost += addr_cost->register_sextend;
8992 break;
8993
8994 case ADDRESS_REG_UXTW:
8995 cost += addr_cost->register_zextend;
8996 break;
8997
8998 default:
8999 gcc_unreachable ();
9000 }
9001
9002
9003 if (info.shift > 0)
9004 {
9005 /* For the sake of calculating the cost of the shifted register
9006 component, we can treat same sized modes in the same way. */
9007 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9008 cost += addr_cost->addr_scale_costs.hi;
9009 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9010 cost += addr_cost->addr_scale_costs.si;
9011 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9012 cost += addr_cost->addr_scale_costs.di;
9013 else
9014 /* We can't tell, or this is a 128-bit vector. */
9015 cost += addr_cost->addr_scale_costs.ti;
9016 }
9017
9018 return cost;
9019 }
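/* As a rough illustration of the above: a base-plus-immediate address
   such as [x0, #16] classifies as ADDRESS_REG_IMM and costs imm_offset,
   whereas a scaled register offset such as [x0, x1, lsl #3] used for a
   DImode access classifies as ADDRESS_REG_REG with a non-zero shift and
   costs register_offset plus addr_scale_costs.di.  */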
9020
9021 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9022 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9023 to be taken. */
9024
9025 int
9026 aarch64_branch_cost (bool speed_p, bool predictable_p)
9027 {
9028 /* When optimizing for speed, use the cost of unpredictable branches. */
9029 const struct cpu_branch_cost *branch_costs =
9030 aarch64_tune_params.branch_costs;
9031
9032 if (!speed_p || predictable_p)
9033 return branch_costs->predictable;
9034 else
9035 return branch_costs->unpredictable;
9036 }
9037
9038 /* Return true if the RTX X in mode MODE is a zero or sign extract
9039 usable in an ADD or SUB (extended register) instruction. */
9040 static bool
9041 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9042 {
9043 /* Catch add with a sign extract.
9044 This is add_<optab><mode>_multp2. */
9045 if (GET_CODE (x) == SIGN_EXTRACT
9046 || GET_CODE (x) == ZERO_EXTRACT)
9047 {
9048 rtx op0 = XEXP (x, 0);
9049 rtx op1 = XEXP (x, 1);
9050 rtx op2 = XEXP (x, 2);
9051
9052 if (GET_CODE (op0) == MULT
9053 && CONST_INT_P (op1)
9054 && op2 == const0_rtx
9055 && CONST_INT_P (XEXP (op0, 1))
9056 && aarch64_is_extend_from_extract (mode,
9057 XEXP (op0, 1),
9058 op1))
9059 {
9060 return true;
9061 }
9062 }
9063 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9064 No shift. */
9065 else if (GET_CODE (x) == SIGN_EXTEND
9066 || GET_CODE (x) == ZERO_EXTEND)
9067 return REG_P (XEXP (x, 0));
9068
9069 return false;
9070 }
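/* For example, (zero_extend:DI (reg:SI w1)) used as the extended operand
   of a DImode PLUS or MINUS is accepted via the ZERO_EXTEND/SIGN_EXTEND
   arm above, allowing the whole expression to be costed as a single
   ADD/SUB (extended register), e.g. ADD x0, x2, w1, uxtw.  */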
9071
9072 static bool
9073 aarch64_frint_unspec_p (unsigned int u)
9074 {
9075 switch (u)
9076 {
9077 case UNSPEC_FRINTZ:
9078 case UNSPEC_FRINTP:
9079 case UNSPEC_FRINTM:
9080 case UNSPEC_FRINTA:
9081 case UNSPEC_FRINTN:
9082 case UNSPEC_FRINTX:
9083 case UNSPEC_FRINTI:
9084 return true;
9085
9086 default:
9087 return false;
9088 }
9089 }
9090
9091 /* Return true iff X is an rtx that will match an extr instruction
9092 i.e. as described in the *extr<mode>5_insn family of patterns.
9093 OP0 and OP1 will be set to the operands of the shifts involved
9094 on success and will be NULL_RTX otherwise. */
9095
9096 static bool
9097 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9098 {
9099 rtx op0, op1;
9100 scalar_int_mode mode;
9101 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9102 return false;
9103
9104 *res_op0 = NULL_RTX;
9105 *res_op1 = NULL_RTX;
9106
9107 if (GET_CODE (x) != IOR)
9108 return false;
9109
9110 op0 = XEXP (x, 0);
9111 op1 = XEXP (x, 1);
9112
9113 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9114 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9115 {
9116 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9117 if (GET_CODE (op1) == ASHIFT)
9118 std::swap (op0, op1);
9119
9120 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9121 return false;
9122
9123 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9124 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9125
9126 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9127 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9128 {
9129 *res_op0 = XEXP (op0, 0);
9130 *res_op1 = XEXP (op1, 0);
9131 return true;
9132 }
9133 }
9134
9135 return false;
9136 }
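/* For instance, in DImode
     (ior:DI (ashift:DI (reg x) (const_int 48))
	     (lshiftrt:DI (reg y) (const_int 16)))
   matches: the shift amounts sum to 64, so *RES_OP0 is set to x and
   *RES_OP1 to y, i.e. an EXTR of x and y with an lsb of 16.  */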
9137
9138 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9139 storing it in *COST. Result is true if the total cost of the operation
9140 has now been calculated. */
9141 static bool
9142 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9143 {
9144 rtx inner;
9145 rtx comparator;
9146 enum rtx_code cmpcode;
9147
9148 if (COMPARISON_P (op0))
9149 {
9150 inner = XEXP (op0, 0);
9151 comparator = XEXP (op0, 1);
9152 cmpcode = GET_CODE (op0);
9153 }
9154 else
9155 {
9156 inner = op0;
9157 comparator = const0_rtx;
9158 cmpcode = NE;
9159 }
9160
9161 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9162 {
9163 /* Conditional branch. */
9164 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9165 return true;
9166 else
9167 {
9168 if (cmpcode == NE || cmpcode == EQ)
9169 {
9170 if (comparator == const0_rtx)
9171 {
9172 /* TBZ/TBNZ/CBZ/CBNZ. */
9173 if (GET_CODE (inner) == ZERO_EXTRACT)
9174 /* TBZ/TBNZ. */
9175 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9176 ZERO_EXTRACT, 0, speed);
9177 else
9178 /* CBZ/CBNZ. */
9179 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9180
9181 return true;
9182 }
9183 }
9184 else if (cmpcode == LT || cmpcode == GE)
9185 {
9186 /* TBZ/TBNZ. */
9187 if (comparator == const0_rtx)
9188 return true;
9189 }
9190 }
9191 }
9192 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9193 {
9194 /* CCMP. */
9195 if (GET_CODE (op1) == COMPARE)
9196 {
9197 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9198 if (XEXP (op1, 1) == const0_rtx)
9199 *cost += 1;
9200 if (speed)
9201 {
9202 machine_mode mode = GET_MODE (XEXP (op1, 0));
9203 const struct cpu_cost_table *extra_cost
9204 = aarch64_tune_params.insn_extra_cost;
9205
9206 if (GET_MODE_CLASS (mode) == MODE_INT)
9207 *cost += extra_cost->alu.arith;
9208 else
9209 *cost += extra_cost->fp[mode == DFmode].compare;
9210 }
9211 return true;
9212 }
9213
9214 /* It's a conditional operation based on the status flags,
9215 so it must be some flavor of CSEL. */
9216
9217 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9218 if (GET_CODE (op1) == NEG
9219 || GET_CODE (op1) == NOT
9220 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9221 op1 = XEXP (op1, 0);
9222 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9223 {
9224 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9225 op1 = XEXP (op1, 0);
9226 op2 = XEXP (op2, 0);
9227 }
9228
9229 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9230 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9231 return true;
9232 }
9233
9234 /* We don't know what this is, cost all operands. */
9235 return false;
9236 }
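/* For instance, a conditional branch of the form
     (if_then_else (ne (reg:DI x) (const_int 0)) (label_ref L) (pc))
   reaches the CBZ/CBNZ path above and only the cost of X itself is
   added, while the same comparison against a ZERO_EXTRACT is costed
   as a TBZ/TBNZ.  */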
9237
9238 /* Check whether X is a bitfield operation of the form shift + extend that
9239 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9240 operand to which the bitfield operation is applied. Otherwise return
9241 NULL_RTX. */
9242
9243 static rtx
9244 aarch64_extend_bitfield_pattern_p (rtx x)
9245 {
9246 rtx_code outer_code = GET_CODE (x);
9247 machine_mode outer_mode = GET_MODE (x);
9248
9249 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9250 && outer_mode != SImode && outer_mode != DImode)
9251 return NULL_RTX;
9252
9253 rtx inner = XEXP (x, 0);
9254 rtx_code inner_code = GET_CODE (inner);
9255 machine_mode inner_mode = GET_MODE (inner);
9256 rtx op = NULL_RTX;
9257
9258 switch (inner_code)
9259 {
9260 case ASHIFT:
9261 if (CONST_INT_P (XEXP (inner, 1))
9262 && (inner_mode == QImode || inner_mode == HImode))
9263 op = XEXP (inner, 0);
9264 break;
9265 case LSHIFTRT:
9266 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9267 && (inner_mode == QImode || inner_mode == HImode))
9268 op = XEXP (inner, 0);
9269 break;
9270 case ASHIFTRT:
9271 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9272 && (inner_mode == QImode || inner_mode == HImode))
9273 op = XEXP (inner, 0);
9274 break;
9275 default:
9276 break;
9277 }
9278
9279 return op;
9280 }
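/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   is recognised and (reg:HI x) is returned, since the combination maps
   onto a single UBFX Wd, Wn, #3, #13; a SIGN_EXTEND wrapped around the
   same LSHIFTRT is rejected because only ASHIFTRT pairs with sign
   extension here.  */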
9281
9282 /* Return true if the mask and a shift amount from an RTX of the form
9283 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9284 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9285
9286 bool
9287 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9288 rtx shft_amnt)
9289 {
9290 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9291 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9292 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9293 && (INTVAL (mask)
9294 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9295 }
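/* A concrete example of the test above, for SImode: MASK 0xff0 with
   SHFT_AMNT 4 is accepted, since 4 < 32, (0xff0 >> 4) + 1 == 0x100 is a
   power of two and the low four bits of the mask are clear, so the
   combination maps onto UBFIZ Wd, Wn, #4, #8.  A mask of 0xff4 with the
   same shift is rejected because bit 2 overlaps the shifted-out range.  */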
9296
9297 /* Calculate the cost of calculating X, storing it in *COST. Result
9298 is true if the total cost of the operation has now been calculated. */
9299 static bool
9300 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9301 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9302 {
9303 rtx op0, op1, op2;
9304 const struct cpu_cost_table *extra_cost
9305 = aarch64_tune_params.insn_extra_cost;
9306 int code = GET_CODE (x);
9307 scalar_int_mode int_mode;
9308
9309 /* By default, assume that everything has equivalent cost to the
9310 cheapest instruction. Any additional costs are applied as a delta
9311 above this default. */
9312 *cost = COSTS_N_INSNS (1);
9313
9314 switch (code)
9315 {
9316 case SET:
9317 /* The cost depends entirely on the operands to SET. */
9318 *cost = 0;
9319 op0 = SET_DEST (x);
9320 op1 = SET_SRC (x);
9321
9322 switch (GET_CODE (op0))
9323 {
9324 case MEM:
9325 if (speed)
9326 {
9327 rtx address = XEXP (op0, 0);
9328 if (VECTOR_MODE_P (mode))
9329 *cost += extra_cost->ldst.storev;
9330 else if (GET_MODE_CLASS (mode) == MODE_INT)
9331 *cost += extra_cost->ldst.store;
9332 else if (mode == SFmode)
9333 *cost += extra_cost->ldst.storef;
9334 else if (mode == DFmode)
9335 *cost += extra_cost->ldst.stored;
9336
9337 *cost +=
9338 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9339 0, speed));
9340 }
9341
9342 *cost += rtx_cost (op1, mode, SET, 1, speed);
9343 return true;
9344
9345 case SUBREG:
9346 if (! REG_P (SUBREG_REG (op0)))
9347 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9348
9349 /* Fall through. */
9350 case REG:
9351 /* The cost is one per vector-register copied. */
9352 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9353 {
9354 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9355 *cost = COSTS_N_INSNS (nregs);
9356 }
9357 /* const0_rtx is in general free, but we will use an
9358 instruction to set a register to 0. */
9359 else if (REG_P (op1) || op1 == const0_rtx)
9360 {
9361 /* The cost is 1 per register copied. */
9362 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9363 *cost = COSTS_N_INSNS (nregs);
9364 }
9365 else
9366 /* Cost is just the cost of the RHS of the set. */
9367 *cost += rtx_cost (op1, mode, SET, 1, speed);
9368 return true;
9369
9370 case ZERO_EXTRACT:
9371 case SIGN_EXTRACT:
9372 /* Bit-field insertion. Strip any redundant widening of
9373 the RHS to meet the width of the target. */
9374 if (GET_CODE (op1) == SUBREG)
9375 op1 = SUBREG_REG (op1);
9376 if ((GET_CODE (op1) == ZERO_EXTEND
9377 || GET_CODE (op1) == SIGN_EXTEND)
9378 && CONST_INT_P (XEXP (op0, 1))
9379 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9380 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9381 op1 = XEXP (op1, 0);
9382
9383 if (CONST_INT_P (op1))
9384 {
9385 /* MOV immediate is assumed to always be cheap. */
9386 *cost = COSTS_N_INSNS (1);
9387 }
9388 else
9389 {
9390 /* BFM. */
9391 if (speed)
9392 *cost += extra_cost->alu.bfi;
9393 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9394 }
9395
9396 return true;
9397
9398 default:
9399 /* We can't make sense of this, assume default cost. */
9400 *cost = COSTS_N_INSNS (1);
9401 return false;
9402 }
9403 return false;
9404
9405 case CONST_INT:
9406 /* If an instruction can incorporate a constant within the
9407 instruction, the instruction's expression avoids calling
9408 rtx_cost() on the constant. If rtx_cost() is called on a
9409 constant, then it is usually because the constant must be
9410 moved into a register by one or more instructions.
9411
9412 The exception is constant 0, which can be expressed
9413 as XZR/WZR and is therefore free. Even then, a plain
9414 (set (reg) (const0_rtx)) must cost the move, but we catch
9415 that when we cost the SET, so we don't need to consider
9416 it here. */
9417 if (x == const0_rtx)
9418 *cost = 0;
9419 else
9420 {
9421 /* To an approximation, building any other constant is
9422 proportionally expensive to the number of instructions
9423 required to build that constant. This is true whether we
9424 are compiling for SPEED or otherwise. */
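  /* As a rough example, a DImode constant such as 0x123456789abc
     needs a MOVZ plus two MOVKs, so it is costed as COSTS_N_INSNS (3),
     whereas a 16-bit or bitmask immediate is a single instruction.  */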
9425 if (!is_a <scalar_int_mode> (mode, &int_mode))
9426 int_mode = word_mode;
9427 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9428 (NULL_RTX, x, false, int_mode));
9429 }
9430 return true;
9431
9432 case CONST_DOUBLE:
9433
9434 /* First determine number of instructions to do the move
9435 as an integer constant. */
9436 if (!aarch64_float_const_representable_p (x)
9437 && !aarch64_can_const_movi_rtx_p (x, mode)
9438 && aarch64_float_const_rtx_p (x))
9439 {
9440 unsigned HOST_WIDE_INT ival;
9441 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9442 gcc_assert (succeed);
9443
9444 scalar_int_mode imode = (mode == HFmode
9445 ? SImode
9446 : int_mode_for_mode (mode).require ());
9447 int ncost = aarch64_internal_mov_immediate
9448 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9449 *cost += COSTS_N_INSNS (ncost);
9450 return true;
9451 }
9452
9453 if (speed)
9454 {
9455 /* mov[df,sf]_aarch64. */
9456 if (aarch64_float_const_representable_p (x))
9457 /* FMOV (scalar immediate). */
9458 *cost += extra_cost->fp[mode == DFmode].fpconst;
9459 else if (!aarch64_float_const_zero_rtx_p (x))
9460 {
9461 /* This will be a load from memory. */
9462 if (mode == DFmode)
9463 *cost += extra_cost->ldst.loadd;
9464 else
9465 *cost += extra_cost->ldst.loadf;
9466 }
9467 else
9468 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9469 or MOV v0.s[0], wzr - neither of which are modeled by the
9470 cost tables. Just use the default cost. */
9471 {
9472 }
9473 }
9474
9475 return true;
9476
9477 case MEM:
9478 if (speed)
9479 {
9480 /* For loads we want the base cost of a load, plus an
9481 approximation for the additional cost of the addressing
9482 mode. */
9483 rtx address = XEXP (x, 0);
9484 if (VECTOR_MODE_P (mode))
9485 *cost += extra_cost->ldst.loadv;
9486 else if (GET_MODE_CLASS (mode) == MODE_INT)
9487 *cost += extra_cost->ldst.load;
9488 else if (mode == SFmode)
9489 *cost += extra_cost->ldst.loadf;
9490 else if (mode == DFmode)
9491 *cost += extra_cost->ldst.loadd;
9492
9493 *cost +=
9494 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9495 0, speed));
9496 }
9497
9498 return true;
9499
9500 case NEG:
9501 op0 = XEXP (x, 0);
9502
9503 if (VECTOR_MODE_P (mode))
9504 {
9505 if (speed)
9506 {
9507 /* FNEG. */
9508 *cost += extra_cost->vect.alu;
9509 }
9510 return false;
9511 }
9512
9513 if (GET_MODE_CLASS (mode) == MODE_INT)
9514 {
9515 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9516 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9517 {
9518 /* CSETM. */
9519 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9520 return true;
9521 }
9522
9523 /* Cost this as SUB wzr, X. */
9524 op0 = CONST0_RTX (mode);
9525 op1 = XEXP (x, 0);
9526 goto cost_minus;
9527 }
9528
9529 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9530 {
9531 /* Support (neg(fma...)) as a single instruction only if
9532 sign of zeros is unimportant. This matches the decision
9533 making in aarch64.md. */
9534 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9535 {
9536 /* FNMADD. */
9537 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9538 return true;
9539 }
9540 if (GET_CODE (op0) == MULT)
9541 {
9542 /* FNMUL. */
9543 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9544 return true;
9545 }
9546 if (speed)
9547 /* FNEG. */
9548 *cost += extra_cost->fp[mode == DFmode].neg;
9549 return false;
9550 }
9551
9552 return false;
9553
9554 case CLRSB:
9555 case CLZ:
9556 if (speed)
9557 {
9558 if (VECTOR_MODE_P (mode))
9559 *cost += extra_cost->vect.alu;
9560 else
9561 *cost += extra_cost->alu.clz;
9562 }
9563
9564 return false;
9565
9566 case COMPARE:
9567 op0 = XEXP (x, 0);
9568 op1 = XEXP (x, 1);
9569
9570 if (op1 == const0_rtx
9571 && GET_CODE (op0) == AND)
9572 {
9573 x = op0;
9574 mode = GET_MODE (op0);
9575 goto cost_logic;
9576 }
9577
9578 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9579 {
9580 /* TODO: A write to the CC flags possibly costs extra; this
9581 needs encoding in the cost tables. */
9582
9583 mode = GET_MODE (op0);
9584 /* ANDS. */
9585 if (GET_CODE (op0) == AND)
9586 {
9587 x = op0;
9588 goto cost_logic;
9589 }
9590
9591 if (GET_CODE (op0) == PLUS)
9592 {
9593 /* ADDS (and CMN alias). */
9594 x = op0;
9595 goto cost_plus;
9596 }
9597
9598 if (GET_CODE (op0) == MINUS)
9599 {
9600 /* SUBS. */
9601 x = op0;
9602 goto cost_minus;
9603 }
9604
9605 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9606 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9607 && CONST_INT_P (XEXP (op0, 2)))
9608 {
9609 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9610 Handle it here directly rather than going to cost_logic
9611 since we know the immediate generated for the TST is valid
9612 so we can avoid creating an intermediate rtx for it only
9613 for costing purposes. */
9614 if (speed)
9615 *cost += extra_cost->alu.logical;
9616
9617 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9618 ZERO_EXTRACT, 0, speed);
9619 return true;
9620 }
9621
9622 if (GET_CODE (op1) == NEG)
9623 {
9624 /* CMN. */
9625 if (speed)
9626 *cost += extra_cost->alu.arith;
9627
9628 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9629 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9630 return true;
9631 }
9632
9633 /* CMP.
9634
9635 Compare can freely swap the order of operands, and
9636 canonicalization puts the more complex operation first.
9637 But the integer MINUS logic expects the shift/extend
9638 operation in op1. */
9639 if (! (REG_P (op0)
9640 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9641 {
9642 op0 = XEXP (x, 1);
9643 op1 = XEXP (x, 0);
9644 }
9645 goto cost_minus;
9646 }
9647
9648 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9649 {
9650 /* FCMP. */
9651 if (speed)
9652 *cost += extra_cost->fp[mode == DFmode].compare;
9653
9654 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9655 {
9656 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9657 /* FCMP supports constant 0.0 for no extra cost. */
9658 return true;
9659 }
9660 return false;
9661 }
9662
9663 if (VECTOR_MODE_P (mode))
9664 {
9665 /* Vector compare. */
9666 if (speed)
9667 *cost += extra_cost->vect.alu;
9668
9669 if (aarch64_float_const_zero_rtx_p (op1))
9670 {
9671 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9672 cost. */
9673 return true;
9674 }
9675 return false;
9676 }
9677 return false;
9678
9679 case MINUS:
9680 {
9681 op0 = XEXP (x, 0);
9682 op1 = XEXP (x, 1);
9683
9684 cost_minus:
9685 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9686
9687 /* Detect valid immediates. */
9688 if ((GET_MODE_CLASS (mode) == MODE_INT
9689 || (GET_MODE_CLASS (mode) == MODE_CC
9690 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9691 && CONST_INT_P (op1)
9692 && aarch64_uimm12_shift (INTVAL (op1)))
9693 {
9694 if (speed)
9695 /* SUB(S) (immediate). */
9696 *cost += extra_cost->alu.arith;
9697 return true;
9698 }
9699
9700 /* Look for SUB (extended register). */
9701 if (is_a <scalar_int_mode> (mode, &int_mode)
9702 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9703 {
9704 if (speed)
9705 *cost += extra_cost->alu.extend_arith;
9706
9707 op1 = aarch64_strip_extend (op1, true);
9708 *cost += rtx_cost (op1, VOIDmode,
9709 (enum rtx_code) GET_CODE (op1), 0, speed);
9710 return true;
9711 }
9712
9713 rtx new_op1 = aarch64_strip_extend (op1, false);
9714
9715 /* Cost this as an FMA-alike operation. */
9716 if ((GET_CODE (new_op1) == MULT
9717 || aarch64_shift_p (GET_CODE (new_op1)))
9718 && code != COMPARE)
9719 {
9720 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9721 (enum rtx_code) code,
9722 speed);
9723 return true;
9724 }
9725
9726 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9727
9728 if (speed)
9729 {
9730 if (VECTOR_MODE_P (mode))
9731 {
9732 /* Vector SUB. */
9733 *cost += extra_cost->vect.alu;
9734 }
9735 else if (GET_MODE_CLASS (mode) == MODE_INT)
9736 {
9737 /* SUB(S). */
9738 *cost += extra_cost->alu.arith;
9739 }
9740 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9741 {
9742 /* FSUB. */
9743 *cost += extra_cost->fp[mode == DFmode].addsub;
9744 }
9745 }
9746 return true;
9747 }
9748
9749 case PLUS:
9750 {
9751 rtx new_op0;
9752
9753 op0 = XEXP (x, 0);
9754 op1 = XEXP (x, 1);
9755
9756 cost_plus:
9757 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9758 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9759 {
9760 /* CSINC. */
9761 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9762 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9763 return true;
9764 }
9765
9766 if (GET_MODE_CLASS (mode) == MODE_INT
9767 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9768 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9769 {
9770 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9771
9772 if (speed)
9773 /* ADD (immediate). */
9774 *cost += extra_cost->alu.arith;
9775 return true;
9776 }
9777
9778 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9779
9780 /* Look for ADD (extended register). */
9781 if (is_a <scalar_int_mode> (mode, &int_mode)
9782 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9783 {
9784 if (speed)
9785 *cost += extra_cost->alu.extend_arith;
9786
9787 op0 = aarch64_strip_extend (op0, true);
9788 *cost += rtx_cost (op0, VOIDmode,
9789 (enum rtx_code) GET_CODE (op0), 0, speed);
9790 return true;
9791 }
9792
9793 /* Strip any extend, leave shifts behind as we will
9794 cost them through mult_cost. */
9795 new_op0 = aarch64_strip_extend (op0, false);
9796
9797 if (GET_CODE (new_op0) == MULT
9798 || aarch64_shift_p (GET_CODE (new_op0)))
9799 {
9800 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9801 speed);
9802 return true;
9803 }
9804
9805 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9806
9807 if (speed)
9808 {
9809 if (VECTOR_MODE_P (mode))
9810 {
9811 /* Vector ADD. */
9812 *cost += extra_cost->vect.alu;
9813 }
9814 else if (GET_MODE_CLASS (mode) == MODE_INT)
9815 {
9816 /* ADD. */
9817 *cost += extra_cost->alu.arith;
9818 }
9819 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9820 {
9821 /* FADD. */
9822 *cost += extra_cost->fp[mode == DFmode].addsub;
9823 }
9824 }
9825 return true;
9826 }
9827
9828 case BSWAP:
9829 *cost = COSTS_N_INSNS (1);
9830
9831 if (speed)
9832 {
9833 if (VECTOR_MODE_P (mode))
9834 *cost += extra_cost->vect.alu;
9835 else
9836 *cost += extra_cost->alu.rev;
9837 }
9838 return false;
9839
9840 case IOR:
9841 if (aarch_rev16_p (x))
9842 {
9843 *cost = COSTS_N_INSNS (1);
9844
9845 if (speed)
9846 {
9847 if (VECTOR_MODE_P (mode))
9848 *cost += extra_cost->vect.alu;
9849 else
9850 *cost += extra_cost->alu.rev;
9851 }
9852 return true;
9853 }
9854
9855 if (aarch64_extr_rtx_p (x, &op0, &op1))
9856 {
9857 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9858 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9859 if (speed)
9860 *cost += extra_cost->alu.shift;
9861
9862 return true;
9863 }
9864 /* Fall through. */
9865 case XOR:
9866 case AND:
9867 cost_logic:
9868 op0 = XEXP (x, 0);
9869 op1 = XEXP (x, 1);
9870
9871 if (VECTOR_MODE_P (mode))
9872 {
9873 if (speed)
9874 *cost += extra_cost->vect.alu;
9875 return true;
9876 }
9877
9878 if (code == AND
9879 && GET_CODE (op0) == MULT
9880 && CONST_INT_P (XEXP (op0, 1))
9881 && CONST_INT_P (op1)
9882 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9883 INTVAL (op1)) != 0)
9884 {
9885 /* This is a UBFM/SBFM. */
9886 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9887 if (speed)
9888 *cost += extra_cost->alu.bfx;
9889 return true;
9890 }
9891
9892 if (is_int_mode (mode, &int_mode))
9893 {
9894 if (CONST_INT_P (op1))
9895 {
9896 /* We have a mask + shift version of a UBFIZ
9897 i.e. the *andim_ashift<mode>_bfiz pattern. */
9898 if (GET_CODE (op0) == ASHIFT
9899 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9900 XEXP (op0, 1)))
9901 {
9902 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9903 (enum rtx_code) code, 0, speed);
9904 if (speed)
9905 *cost += extra_cost->alu.bfx;
9906
9907 return true;
9908 }
9909 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9910 {
9911 /* We possibly get the immediate for free; this is not
9912 modelled. */
9913 *cost += rtx_cost (op0, int_mode,
9914 (enum rtx_code) code, 0, speed);
9915 if (speed)
9916 *cost += extra_cost->alu.logical;
9917
9918 return true;
9919 }
9920 }
9921 else
9922 {
9923 rtx new_op0 = op0;
9924
9925 /* Handle ORN, EON, or BIC. */
9926 if (GET_CODE (op0) == NOT)
9927 op0 = XEXP (op0, 0);
9928
9929 new_op0 = aarch64_strip_shift (op0);
9930
9931 /* If we had a shift on op0 then this is a logical-shift-
9932 by-register/immediate operation. Otherwise, this is just
9933 a logical operation. */
9934 if (speed)
9935 {
9936 if (new_op0 != op0)
9937 {
9938 /* Shift by immediate. */
9939 if (CONST_INT_P (XEXP (op0, 1)))
9940 *cost += extra_cost->alu.log_shift;
9941 else
9942 *cost += extra_cost->alu.log_shift_reg;
9943 }
9944 else
9945 *cost += extra_cost->alu.logical;
9946 }
9947
9948 /* In both cases we want to cost both operands. */
9949 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9950 0, speed);
9951 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9952 1, speed);
9953
9954 return true;
9955 }
9956 }
9957 return false;
9958
9959 case NOT:
9960 x = XEXP (x, 0);
9961 op0 = aarch64_strip_shift (x);
9962
9963 if (VECTOR_MODE_P (mode))
9964 {
9965 /* Vector NOT. */
9966 *cost += extra_cost->vect.alu;
9967 return false;
9968 }
9969
9970 /* MVN-shifted-reg. */
9971 if (op0 != x)
9972 {
9973 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9974
9975 if (speed)
9976 *cost += extra_cost->alu.log_shift;
9977
9978 return true;
9979 }
9980 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9981 Handle the second form here taking care that 'a' in the above can
9982 be a shift. */
9983 else if (GET_CODE (op0) == XOR)
9984 {
9985 rtx newop0 = XEXP (op0, 0);
9986 rtx newop1 = XEXP (op0, 1);
9987 rtx op0_stripped = aarch64_strip_shift (newop0);
9988
9989 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9990 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9991
9992 if (speed)
9993 {
9994 if (op0_stripped != newop0)
9995 *cost += extra_cost->alu.log_shift;
9996 else
9997 *cost += extra_cost->alu.logical;
9998 }
9999
10000 return true;
10001 }
10002 /* MVN. */
10003 if (speed)
10004 *cost += extra_cost->alu.logical;
10005
10006 return false;
10007
10008 case ZERO_EXTEND:
10009
10010 op0 = XEXP (x, 0);
10011 /* If a value is written in SI mode, then zero extended to DI
10012 mode, the operation will in general be free as a write to
10013 a 'w' register implicitly zeroes the upper bits of an 'x'
10014 register. However, if this is
10015
10016 (set (reg) (zero_extend (reg)))
10017
10018 we must cost the explicit register move. */
10019 if (mode == DImode
10020 && GET_MODE (op0) == SImode
10021 && outer == SET)
10022 {
10023 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10024
10025 /* If OP_COST is non-zero, then the cost of the zero extend
10026 is effectively the cost of the inner operation. Otherwise
10027 we have a MOV instruction and we take the cost from the MOV
10028 itself. This is true independently of whether we are
10029 optimizing for space or time. */
10030 if (op_cost)
10031 *cost = op_cost;
10032
10033 return true;
10034 }
10035 else if (MEM_P (op0))
10036 {
10037 /* All loads can zero extend to any size for free. */
10038 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10039 return true;
10040 }
10041
10042 op0 = aarch64_extend_bitfield_pattern_p (x);
10043 if (op0)
10044 {
10045 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10046 if (speed)
10047 *cost += extra_cost->alu.bfx;
10048 return true;
10049 }
10050
10051 if (speed)
10052 {
10053 if (VECTOR_MODE_P (mode))
10054 {
10055 /* UMOV. */
10056 *cost += extra_cost->vect.alu;
10057 }
10058 else
10059 {
10060 /* We generate an AND instead of UXTB/UXTH. */
10061 *cost += extra_cost->alu.logical;
10062 }
10063 }
10064 return false;
10065
10066 case SIGN_EXTEND:
10067 if (MEM_P (XEXP (x, 0)))
10068 {
10069 /* LDRSH. */
10070 if (speed)
10071 {
10072 rtx address = XEXP (XEXP (x, 0), 0);
10073 *cost += extra_cost->ldst.load_sign_extend;
10074
10075 *cost +=
10076 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10077 0, speed));
10078 }
10079 return true;
10080 }
10081
10082 op0 = aarch64_extend_bitfield_pattern_p (x);
10083 if (op0)
10084 {
10085 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10086 if (speed)
10087 *cost += extra_cost->alu.bfx;
10088 return true;
10089 }
10090
10091 if (speed)
10092 {
10093 if (VECTOR_MODE_P (mode))
10094 *cost += extra_cost->vect.alu;
10095 else
10096 *cost += extra_cost->alu.extend;
10097 }
10098 return false;
10099
10100 case ASHIFT:
10101 op0 = XEXP (x, 0);
10102 op1 = XEXP (x, 1);
10103
10104 if (CONST_INT_P (op1))
10105 {
10106 if (speed)
10107 {
10108 if (VECTOR_MODE_P (mode))
10109 {
10110 /* Vector shift (immediate). */
10111 *cost += extra_cost->vect.alu;
10112 }
10113 else
10114 {
10115 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10116 aliases. */
10117 *cost += extra_cost->alu.shift;
10118 }
10119 }
10120
10121 /* We can incorporate zero/sign extend for free. */
10122 if (GET_CODE (op0) == ZERO_EXTEND
10123 || GET_CODE (op0) == SIGN_EXTEND)
10124 op0 = XEXP (op0, 0);
10125
10126 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10127 return true;
10128 }
10129 else
10130 {
10131 if (VECTOR_MODE_P (mode))
10132 {
10133 if (speed)
10134 /* Vector shift (register). */
10135 *cost += extra_cost->vect.alu;
10136 }
10137 else
10138 {
10139 if (speed)
10140 /* LSLV. */
10141 *cost += extra_cost->alu.shift_reg;
10142
10143 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10144 && CONST_INT_P (XEXP (op1, 1))
10145 && known_eq (INTVAL (XEXP (op1, 1)),
10146 GET_MODE_BITSIZE (mode) - 1))
10147 {
10148 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10149 /* We already demanded XEXP (op1, 0) to be REG_P, so
10150 don't recurse into it. */
10151 return true;
10152 }
10153 }
10154 return false; /* All arguments need to be in registers. */
10155 }
10156
10157 case ROTATE:
10158 case ROTATERT:
10159 case LSHIFTRT:
10160 case ASHIFTRT:
10161 op0 = XEXP (x, 0);
10162 op1 = XEXP (x, 1);
10163
10164 if (CONST_INT_P (op1))
10165 {
10166 /* ASR (immediate) and friends. */
10167 if (speed)
10168 {
10169 if (VECTOR_MODE_P (mode))
10170 *cost += extra_cost->vect.alu;
10171 else
10172 *cost += extra_cost->alu.shift;
10173 }
10174
10175 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10176 return true;
10177 }
10178 else
10179 {
10180 if (VECTOR_MODE_P (mode))
10181 {
10182 if (speed)
10183 /* Vector shift (register). */
10184 *cost += extra_cost->vect.alu;
10185 }
10186 else
10187 {
10188 if (speed)
10189 /* ASR (register) and friends. */
10190 *cost += extra_cost->alu.shift_reg;
10191
10192 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10193 && CONST_INT_P (XEXP (op1, 1))
10194 && known_eq (INTVAL (XEXP (op1, 1)),
10195 GET_MODE_BITSIZE (mode) - 1))
10196 {
10197 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10198 /* We already demanded XEXP (op1, 0) to be REG_P, so
10199 don't recurse into it. */
10200 return true;
10201 }
10202 }
10203 return false; /* All arguments need to be in registers. */
10204 }
10205
10206 case SYMBOL_REF:
10207
10208 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10209 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10210 {
10211 /* LDR. */
10212 if (speed)
10213 *cost += extra_cost->ldst.load;
10214 }
10215 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10216 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10217 {
10218 /* ADRP, followed by ADD. */
10219 *cost += COSTS_N_INSNS (1);
10220 if (speed)
10221 *cost += 2 * extra_cost->alu.arith;
10222 }
10223 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10224 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10225 {
10226 /* ADR. */
10227 if (speed)
10228 *cost += extra_cost->alu.arith;
10229 }
10230
10231 if (flag_pic)
10232 {
10233 /* One extra load instruction, after accessing the GOT. */
10234 *cost += COSTS_N_INSNS (1);
10235 if (speed)
10236 *cost += extra_cost->ldst.load;
10237 }
10238 return true;
10239
10240 case HIGH:
10241 case LO_SUM:
10242 /* ADRP/ADD (immediate). */
10243 if (speed)
10244 *cost += extra_cost->alu.arith;
10245 return true;
10246
10247 case ZERO_EXTRACT:
10248 case SIGN_EXTRACT:
10249 /* UBFX/SBFX. */
10250 if (speed)
10251 {
10252 if (VECTOR_MODE_P (mode))
10253 *cost += extra_cost->vect.alu;
10254 else
10255 *cost += extra_cost->alu.bfx;
10256 }
10257
10258 /* We can trust that the immediates used will be correct (there
10259 are no by-register forms), so we need only cost op0. */
10260 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10261 return true;
10262
10263 case MULT:
10264 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10265 /* aarch64_rtx_mult_cost always handles recursion to its
10266 operands. */
10267 return true;
10268
10269 case MOD:
10270 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10271 ANDs and a CSNEG. Assume here that the CSNEG costs the same as
10272 an unconditional negate. This case should only ever be reached through
10273 the set_smod_pow2_cheap check in expmed.c. */
10274 if (CONST_INT_P (XEXP (x, 1))
10275 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10276 && (mode == SImode || mode == DImode))
10277 {
10278 /* We expand to 4 instructions. Reset the baseline. */
10279 *cost = COSTS_N_INSNS (4);
10280
10281 if (speed)
10282 *cost += 2 * extra_cost->alu.logical
10283 + 2 * extra_cost->alu.arith;
10284
10285 return true;
10286 }
10287
10288 /* Fall-through. */
10289 case UMOD:
10290 if (speed)
10291 {
10292 /* Slightly prefer UMOD over SMOD. */
10293 if (VECTOR_MODE_P (mode))
10294 *cost += extra_cost->vect.alu;
10295 else if (GET_MODE_CLASS (mode) == MODE_INT)
10296 *cost += (extra_cost->mult[mode == DImode].add
10297 + extra_cost->mult[mode == DImode].idiv
10298 + (code == MOD ? 1 : 0));
10299 }
10300 return false; /* All arguments need to be in registers. */
10301
10302 case DIV:
10303 case UDIV:
10304 case SQRT:
10305 if (speed)
10306 {
10307 if (VECTOR_MODE_P (mode))
10308 *cost += extra_cost->vect.alu;
10309 else if (GET_MODE_CLASS (mode) == MODE_INT)
10310 /* There is no integer SQRT, so only DIV and UDIV can get
10311 here. */
10312 *cost += (extra_cost->mult[mode == DImode].idiv
10313 /* Slightly prefer UDIV over SDIV. */
10314 + (code == DIV ? 1 : 0));
10315 else
10316 *cost += extra_cost->fp[mode == DFmode].div;
10317 }
10318 return false; /* All arguments need to be in registers. */
10319
10320 case IF_THEN_ELSE:
10321 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10322 XEXP (x, 2), cost, speed);
10323
10324 case EQ:
10325 case NE:
10326 case GT:
10327 case GTU:
10328 case LT:
10329 case LTU:
10330 case GE:
10331 case GEU:
10332 case LE:
10333 case LEU:
10334
10335 return false; /* All arguments must be in registers. */
10336
10337 case FMA:
10338 op0 = XEXP (x, 0);
10339 op1 = XEXP (x, 1);
10340 op2 = XEXP (x, 2);
10341
10342 if (speed)
10343 {
10344 if (VECTOR_MODE_P (mode))
10345 *cost += extra_cost->vect.alu;
10346 else
10347 *cost += extra_cost->fp[mode == DFmode].fma;
10348 }
10349
10350 /* FMSUB, FNMADD, and FNMSUB are free. */
10351 if (GET_CODE (op0) == NEG)
10352 op0 = XEXP (op0, 0);
10353
10354 if (GET_CODE (op2) == NEG)
10355 op2 = XEXP (op2, 0);
10356
10357 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10358 and the by-element operand as operand 0. */
10359 if (GET_CODE (op1) == NEG)
10360 op1 = XEXP (op1, 0);
10361
10362 /* Catch vector-by-element operations. The by-element operand can
10363 either be (vec_duplicate (vec_select (x))) or just
10364 (vec_select (x)), depending on whether we are multiplying by
10365 a vector or a scalar.
10366
10367 Canonicalization is not very good in these cases, FMA4 will put the
10368 by-element operand as operand 0, FNMA4 will have it as operand 1. */
10369 if (GET_CODE (op0) == VEC_DUPLICATE)
10370 op0 = XEXP (op0, 0);
10371 else if (GET_CODE (op1) == VEC_DUPLICATE)
10372 op1 = XEXP (op1, 0);
10373
10374 if (GET_CODE (op0) == VEC_SELECT)
10375 op0 = XEXP (op0, 0);
10376 else if (GET_CODE (op1) == VEC_SELECT)
10377 op1 = XEXP (op1, 0);
10378
10379 /* If the remaining parameters are not registers,
10380 get the cost to put them into registers. */
10381 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10382 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10383 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10384 return true;
10385
10386 case FLOAT:
10387 case UNSIGNED_FLOAT:
10388 if (speed)
10389 *cost += extra_cost->fp[mode == DFmode].fromint;
10390 return false;
10391
10392 case FLOAT_EXTEND:
10393 if (speed)
10394 {
10395 if (VECTOR_MODE_P (mode))
10396 {
10397 /* Vector widening conversion. */
10398 *cost += extra_cost->vect.alu;
10399 }
10400 else
10401 *cost += extra_cost->fp[mode == DFmode].widen;
10402 }
10403 return false;
10404
10405 case FLOAT_TRUNCATE:
10406 if (speed)
10407 {
10408 if (VECTOR_MODE_P (mode))
10409 {
10410 /* Vector narrowing conversion. */
10411 *cost += extra_cost->vect.alu;
10412 }
10413 else
10414 *cost += extra_cost->fp[mode == DFmode].narrow;
10415 }
10416 return false;
10417
10418 case FIX:
10419 case UNSIGNED_FIX:
10420 x = XEXP (x, 0);
10421 /* Strip the rounding part. They will all be implemented
10422 by the fcvt* family of instructions anyway. */
10423 if (GET_CODE (x) == UNSPEC)
10424 {
10425 unsigned int uns_code = XINT (x, 1);
10426
10427 if (uns_code == UNSPEC_FRINTA
10428 || uns_code == UNSPEC_FRINTM
10429 || uns_code == UNSPEC_FRINTN
10430 || uns_code == UNSPEC_FRINTP
10431 || uns_code == UNSPEC_FRINTZ)
10432 x = XVECEXP (x, 0, 0);
10433 }
10434
10435 if (speed)
10436 {
10437 if (VECTOR_MODE_P (mode))
10438 *cost += extra_cost->vect.alu;
10439 else
10440 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10441 }
10442
10443 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10444 fixed-point fcvt. */
10445 if (GET_CODE (x) == MULT
10446 && ((VECTOR_MODE_P (mode)
10447 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10448 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10449 {
10450 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10451 0, speed);
10452 return true;
10453 }
10454
10455 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10456 return true;
10457
10458 case ABS:
10459 if (VECTOR_MODE_P (mode))
10460 {
10461 /* ABS (vector). */
10462 if (speed)
10463 *cost += extra_cost->vect.alu;
10464 }
10465 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10466 {
10467 op0 = XEXP (x, 0);
10468
10469 /* FABD, which is analogous to FADD. */
10470 if (GET_CODE (op0) == MINUS)
10471 {
10472 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10473 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10474 if (speed)
10475 *cost += extra_cost->fp[mode == DFmode].addsub;
10476
10477 return true;
10478 }
10479 /* Simple FABS is analogous to FNEG. */
10480 if (speed)
10481 *cost += extra_cost->fp[mode == DFmode].neg;
10482 }
10483 else
10484 {
10485 /* Integer ABS will either be split to
10486 two arithmetic instructions, or will be an ABS
10487 (scalar), which we don't model. */
10488 *cost = COSTS_N_INSNS (2);
10489 if (speed)
10490 *cost += 2 * extra_cost->alu.arith;
10491 }
10492 return false;
10493
10494 case SMAX:
10495 case SMIN:
10496 if (speed)
10497 {
10498 if (VECTOR_MODE_P (mode))
10499 *cost += extra_cost->vect.alu;
10500 else
10501 {
10502 /* FMAXNM/FMINNM/FMAX/FMIN.
10503 TODO: This may not be accurate for all implementations, but
10504 we do not model this in the cost tables. */
10505 *cost += extra_cost->fp[mode == DFmode].addsub;
10506 }
10507 }
10508 return false;
10509
10510 case UNSPEC:
10511 /* The floating point round to integer frint* instructions. */
10512 if (aarch64_frint_unspec_p (XINT (x, 1)))
10513 {
10514 if (speed)
10515 *cost += extra_cost->fp[mode == DFmode].roundint;
10516
10517 return false;
10518 }
10519
10520 if (XINT (x, 1) == UNSPEC_RBIT)
10521 {
10522 if (speed)
10523 *cost += extra_cost->alu.rev;
10524
10525 return false;
10526 }
10527 break;
10528
10529 case TRUNCATE:
10530
10531 /* Decompose <su>muldi3_highpart. */
10532 if (/* (truncate:DI */
10533 mode == DImode
10534 /* (lshiftrt:TI */
10535 && GET_MODE (XEXP (x, 0)) == TImode
10536 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10537 /* (mult:TI */
10538 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10539 /* (ANY_EXTEND:TI (reg:DI))
10540 (ANY_EXTEND:TI (reg:DI))) */
10541 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10542 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10543 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10544 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10545 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10546 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10547 /* (const_int 64) */
10548 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10549 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10550 {
10551 /* UMULH/SMULH. */
10552 if (speed)
10553 *cost += extra_cost->mult[mode == DImode].extend;
10554 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10555 mode, MULT, 0, speed);
10556 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10557 mode, MULT, 1, speed);
10558 return true;
10559 }
10560
10561 /* Fall through. */
10562 default:
10563 break;
10564 }
10565
10566 if (dump_file
10567 && flag_aarch64_verbose_cost)
10568 fprintf (dump_file,
10569 "\nFailed to cost RTX. Assuming default cost.\n");
10570
10571 return true;
10572 }
10573
10574 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10575 calculated for X. This cost is stored in *COST. Returns true
10576 if the total cost of X was calculated. */
10577 static bool
10578 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10579 int param, int *cost, bool speed)
10580 {
10581 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10582
10583 if (dump_file
10584 && flag_aarch64_verbose_cost)
10585 {
10586 print_rtl_single (dump_file, x);
10587 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10588 speed ? "Hot" : "Cold",
10589 *cost, result ? "final" : "partial");
10590 }
10591
10592 return result;
10593 }
10594
10595 static int
10596 aarch64_register_move_cost (machine_mode mode,
10597 reg_class_t from_i, reg_class_t to_i)
10598 {
10599 enum reg_class from = (enum reg_class) from_i;
10600 enum reg_class to = (enum reg_class) to_i;
10601 const struct cpu_regmove_cost *regmove_cost
10602 = aarch64_tune_params.regmove_cost;
10603
10604 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10605 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10606 to = GENERAL_REGS;
10607
10608 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10609 from = GENERAL_REGS;
10610
10611 /* Moving between GPR and stack cost is the same as GP2GP. */
10612 if ((from == GENERAL_REGS && to == STACK_REG)
10613 || (to == GENERAL_REGS && from == STACK_REG))
10614 return regmove_cost->GP2GP;
10615
10616 /* To/From the stack register, we move via the gprs. */
10617 if (to == STACK_REG || from == STACK_REG)
10618 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10619 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10620
10621 if (known_eq (GET_MODE_SIZE (mode), 16))
10622 {
10623 /* 128-bit operations on general registers require 2 instructions. */
10624 if (from == GENERAL_REGS && to == GENERAL_REGS)
10625 return regmove_cost->GP2GP * 2;
10626 else if (from == GENERAL_REGS)
10627 return regmove_cost->GP2FP * 2;
10628 else if (to == GENERAL_REGS)
10629 return regmove_cost->FP2GP * 2;
10630
10631 /* When AdvSIMD instructions are disabled it is not possible to move
10632 a 128-bit value directly between Q registers. This is handled in
10633 secondary reload. A general register is used as a scratch to move
10634 the upper DI value and the lower DI value is moved directly,
10635 hence the cost is the sum of three moves. */
10636 if (! TARGET_SIMD)
10637 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10638
10639 return regmove_cost->FP2FP;
10640 }
10641
10642 if (from == GENERAL_REGS && to == GENERAL_REGS)
10643 return regmove_cost->GP2GP;
10644 else if (from == GENERAL_REGS)
10645 return regmove_cost->GP2FP;
10646 else if (to == GENERAL_REGS)
10647 return regmove_cost->FP2GP;
10648
10649 return regmove_cost->FP2FP;
10650 }
10651
10652 static int
10653 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10654 reg_class_t rclass ATTRIBUTE_UNUSED,
10655 bool in ATTRIBUTE_UNUSED)
10656 {
10657 return aarch64_tune_params.memmov_cost;
10658 }
10659
10660 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10661 to optimize 1.0/sqrt. */
10662
10663 static bool
10664 use_rsqrt_p (machine_mode mode)
10665 {
10666 return (!flag_trapping_math
10667 && flag_unsafe_math_optimizations
10668 && ((aarch64_tune_params.approx_modes->recip_sqrt
10669 & AARCH64_APPROX_MODE (mode))
10670 || flag_mrecip_low_precision_sqrt));
10671 }
10672
10673 /* Function to decide when to use the approximate reciprocal square root
10674 builtin. */
10675
10676 static tree
10677 aarch64_builtin_reciprocal (tree fndecl)
10678 {
10679 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10680
10681 if (!use_rsqrt_p (mode))
10682 return NULL_TREE;
10683 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10684 }
10685
10686 /* Emit instruction sequence to compute either the approximate square root
10687 or its approximate reciprocal, depending on the flag RECP, and return
10688 whether the sequence was emitted or not. */
10689
10690 bool
10691 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10692 {
10693 machine_mode mode = GET_MODE (dst);
10694
10695 if (GET_MODE_INNER (mode) == HFmode)
10696 {
10697 gcc_assert (!recp);
10698 return false;
10699 }
10700
10701 if (!recp)
10702 {
10703 if (!(flag_mlow_precision_sqrt
10704 || (aarch64_tune_params.approx_modes->sqrt
10705 & AARCH64_APPROX_MODE (mode))))
10706 return false;
10707
10708 if (flag_finite_math_only
10709 || flag_trapping_math
10710 || !flag_unsafe_math_optimizations
10711 || optimize_function_for_size_p (cfun))
10712 return false;
10713 }
10714 else
10715 /* Caller assumes we cannot fail. */
10716 gcc_assert (use_rsqrt_p (mode));
10717
10718 machine_mode mmsk = mode_for_int_vector (mode).require ();
10719 rtx xmsk = gen_reg_rtx (mmsk);
10720 if (!recp)
10721 /* When calculating the approximate square root, compare the
10722 argument with 0.0 and create a mask. */
10723 emit_insn (gen_rtx_SET (xmsk,
10724 gen_rtx_NEG (mmsk,
10725 gen_rtx_EQ (mmsk, src,
10726 CONST0_RTX (mode)))));
10727
10728 /* Estimate the approximate reciprocal square root. */
10729 rtx xdst = gen_reg_rtx (mode);
10730 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10731
10732 /* Iterate over the series twice for SF and thrice for DF. */
10733 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10734
10735 /* Optionally iterate over the series once less for faster performance
10736 while sacrificing some accuracy. */
10737 if ((recp && flag_mrecip_low_precision_sqrt)
10738 || (!recp && flag_mlow_precision_sqrt))
10739 iterations--;
10740
10741 /* Iterate over the series to calculate the approximate reciprocal square
10742 root. */
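  /* Each FRSQRTS step below computes (3 - a * x * x) / 2, so combined with
     the following multiply this is the Newton-Raphson iteration
     x <- x * (3 - a * x * x) / 2 for 1/sqrt(a), roughly doubling the
     number of correct bits per pass; the last multiply is folded into
     the finalization at the end of the function.  */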
10743 rtx x1 = gen_reg_rtx (mode);
10744 while (iterations--)
10745 {
10746 rtx x2 = gen_reg_rtx (mode);
10747 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10748
10749 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10750
10751 if (iterations > 0)
10752 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10753 }
10754
10755 if (!recp)
10756 {
10757 /* Qualify the approximate reciprocal square root when the argument is
10758 0.0 by squashing the intermediary result to 0.0. */
10759 rtx xtmp = gen_reg_rtx (mmsk);
10760 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10761 gen_rtx_SUBREG (mmsk, xdst, 0)));
10762 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10763
10764 /* Calculate the approximate square root. */
10765 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10766 }
10767
10768 /* Finalize the approximation. */
10769 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10770
10771 return true;
10772 }
10773
10774 /* Emit the instruction sequence to compute the approximation for the division
10775 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10776
10777 bool
10778 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10779 {
10780 machine_mode mode = GET_MODE (quo);
10781
10782 if (GET_MODE_INNER (mode) == HFmode)
10783 return false;
10784
10785 bool use_approx_division_p = (flag_mlow_precision_div
10786 || (aarch64_tune_params.approx_modes->division
10787 & AARCH64_APPROX_MODE (mode)));
10788
10789 if (!flag_finite_math_only
10790 || flag_trapping_math
10791 || !flag_unsafe_math_optimizations
10792 || optimize_function_for_size_p (cfun)
10793 || !use_approx_division_p)
10794 return false;
10795
10796 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10797 return false;
10798
10799 /* Estimate the approximate reciprocal. */
10800 rtx xrcp = gen_reg_rtx (mode);
10801 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10802
10803 /* Iterate over the series twice for SF and thrice for DF. */
10804 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10805
10806 /* Optionally iterate over the series once less for faster performance,
10807 while sacrificing some accuracy. */
10808 if (flag_mlow_precision_div)
10809 iterations--;
10810
10811 /* Iterate over the series to calculate the approximate reciprocal. */
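  /* Each FRECPS step below computes 2 - d * x, so combined with the
     following multiply this is the Newton-Raphson iteration
     x <- x * (2 - d * x) for 1/d; the last multiply is folded into the
     finalization at the end of the function.  */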
10812 rtx xtmp = gen_reg_rtx (mode);
10813 while (iterations--)
10814 {
10815 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10816
10817 if (iterations > 0)
10818 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10819 }
10820
10821 if (num != CONST1_RTX (mode))
10822 {
10823 /* As the approximate reciprocal of DEN is already calculated, only
10824 calculate the approximate division when NUM is not 1.0. */
10825 rtx xnum = force_reg (mode, num);
10826 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10827 }
10828
10829 /* Finalize the approximation. */
10830 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10831 return true;
10832 }
10833
10834 /* Return the number of instructions that can be issued per cycle. */
10835 static int
10836 aarch64_sched_issue_rate (void)
10837 {
10838 return aarch64_tune_params.issue_rate;
10839 }
10840
10841 static int
10842 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10843 {
10844 int issue_rate = aarch64_sched_issue_rate ();
10845
10846 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10847 }
10848
10849
10850 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10851 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10852 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10853
10854 static int
10855 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10856 int ready_index)
10857 {
10858 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10859 }
10860
10861
10862 /* Vectorizer cost model target hooks. */
10863
10864 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10865 static int
10866 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10867 tree vectype,
10868 int misalign ATTRIBUTE_UNUSED)
10869 {
10870 unsigned elements;
10871 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10872 bool fp = false;
10873
10874 if (vectype != NULL)
10875 fp = FLOAT_TYPE_P (vectype);
10876
10877 switch (type_of_cost)
10878 {
10879 case scalar_stmt:
10880 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10881
10882 case scalar_load:
10883 return costs->scalar_load_cost;
10884
10885 case scalar_store:
10886 return costs->scalar_store_cost;
10887
10888 case vector_stmt:
10889 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10890
10891 case vector_load:
10892 return costs->vec_align_load_cost;
10893
10894 case vector_store:
10895 return costs->vec_store_cost;
10896
10897 case vec_to_scalar:
10898 return costs->vec_to_scalar_cost;
10899
10900 case scalar_to_vec:
10901 return costs->scalar_to_vec_cost;
10902
10903 case unaligned_load:
10904 case vector_gather_load:
10905 return costs->vec_unalign_load_cost;
10906
10907 case unaligned_store:
10908 case vector_scatter_store:
10909 return costs->vec_unalign_store_cost;
10910
10911 case cond_branch_taken:
10912 return costs->cond_taken_branch_cost;
10913
10914 case cond_branch_not_taken:
10915 return costs->cond_not_taken_branch_cost;
10916
10917 case vec_perm:
10918 return costs->vec_permute_cost;
10919
10920 case vec_promote_demote:
10921 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10922
10923 case vec_construct:
10924 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10925 return elements / 2 + 1;
10926
10927 default:
10928 gcc_unreachable ();
10929 }
10930 }
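
/* For example, with the formula above a vec_construct of a vector with an
   estimated four elements is costed at 4 / 2 + 1 = 3, i.e. roughly half a
   statement per element plus one.  */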
10931
10932 /* Implement targetm.vectorize.add_stmt_cost. */
10933 static unsigned
10934 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10935 struct _stmt_vec_info *stmt_info, int misalign,
10936 enum vect_cost_model_location where)
10937 {
10938 unsigned *cost = (unsigned *) data;
10939 unsigned retval = 0;
10940
10941 if (flag_vect_cost_model)
10942 {
10943 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10944 int stmt_cost =
10945 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10946
10947 /* Statements in an inner loop relative to the loop being
10948 vectorized are weighted more heavily. The value here is
10949 arbitrary and could potentially be improved with analysis. */
10950 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10951 count *= 50; /* FIXME */
10952
10953 retval = (unsigned) (count * stmt_cost);
10954 cost[where] += retval;
10955 }
10956
10957 return retval;
10958 }
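
/* For example, a vector_load statement that sits in a loop nested inside
   the loop being vectorized and is added with COUNT == 2 contributes
   2 * 50 * vec_align_load_cost to the vect_body bucket, whereas the same
   statement directly in the vectorized loop contributes only
   2 * vec_align_load_cost.  */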
10959
10960 static void initialize_aarch64_code_model (struct gcc_options *);
10961
10962 /* Parse the TO_PARSE string and put the architecture struct that it
10963 selects into RES and the architectural features into ISA_FLAGS.
10964 Return an aarch64_parse_opt_result describing the parse result.
10965 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
10966 When the TO_PARSE string contains an invalid extension,
10967 a copy of the string is created and stored to INVALID_EXTENSION. */
10968
10969 static enum aarch64_parse_opt_result
10970 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10971 unsigned long *isa_flags, std::string *invalid_extension)
10972 {
10973 const char *ext;
10974 const struct processor *arch;
10975 size_t len;
10976
10977 ext = strchr (to_parse, '+');
10978
10979 if (ext != NULL)
10980 len = ext - to_parse;
10981 else
10982 len = strlen (to_parse);
10983
10984 if (len == 0)
10985 return AARCH64_PARSE_MISSING_ARG;
10986
10987
10988 /* Loop through the list of supported ARCHes to find a match. */
10989 for (arch = all_architectures; arch->name != NULL; arch++)
10990 {
10991 if (strlen (arch->name) == len
10992 && strncmp (arch->name, to_parse, len) == 0)
10993 {
10994 unsigned long isa_temp = arch->flags;
10995
10996 if (ext != NULL)
10997 {
10998 /* TO_PARSE string contains at least one extension. */
10999 enum aarch64_parse_opt_result ext_res
11000 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11001
11002 if (ext_res != AARCH64_PARSE_OK)
11003 return ext_res;
11004 }
11005 /* Extension parsing was successful. Confirm the result
11006 arch and ISA flags. */
11007 *res = arch;
11008 *isa_flags = isa_temp;
11009 return AARCH64_PARSE_OK;
11010 }
11011 }
11012
11013 /* ARCH name not found in list. */
11014 return AARCH64_PARSE_INVALID_ARG;
11015 }
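
/* Usage sketch, assuming "armv8.2-a" and "crypto" are entries in the
   architecture and extension tables: parsing "armv8.2-a+crypto" splits at
   the first '+', matches "armv8.2-a" against all_architectures and hands
   "+crypto" to aarch64_parse_extension, e.g.

     const struct processor *arch;
     unsigned long flags;
     std::string bad_ext;
     enum aarch64_parse_opt_result r
       = aarch64_parse_arch ("armv8.2-a+crypto", &arch, &flags, &bad_ext);

   On AARCH64_PARSE_OK, ARCH and FLAGS describe the selected architecture
   and its ISA bits; on failure they are left untouched, and for an
   invalid extension BAD_EXT holds the offending name.  */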
11016
11017 /* Parse the TO_PARSE string and put the result tuning in RES and the
11018 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11019 describing the parse result. If there is an error parsing, RES and
11020 ISA_FLAGS are left unchanged.
11021 When the TO_PARSE string contains an invalid extension,
11022 a copy of the string is created and stored to INVALID_EXTENSION. */
11023
11024 static enum aarch64_parse_opt_result
11025 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11026 unsigned long *isa_flags, std::string *invalid_extension)
11027 {
11028 const char *ext;
11029 const struct processor *cpu;
11030 size_t len;
11031
11032 ext = strchr (to_parse, '+');
11033
11034 if (ext != NULL)
11035 len = ext - to_parse;
11036 else
11037 len = strlen (to_parse);
11038
11039 if (len == 0)
11040 return AARCH64_PARSE_MISSING_ARG;
11041
11042
11043 /* Loop through the list of supported CPUs to find a match. */
11044 for (cpu = all_cores; cpu->name != NULL; cpu++)
11045 {
11046 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11047 {
11048 unsigned long isa_temp = cpu->flags;
11049
11050
11051 if (ext != NULL)
11052 {
11053 /* TO_PARSE string contains at least one extension. */
11054 enum aarch64_parse_opt_result ext_res
11055 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11056
11057 if (ext_res != AARCH64_PARSE_OK)
11058 return ext_res;
11059 }
11060 /* Extension parsing was successful. Confirm the result
11061 cpu and ISA flags. */
11062 *res = cpu;
11063 *isa_flags = isa_temp;
11064 return AARCH64_PARSE_OK;
11065 }
11066 }
11067
11068 /* CPU name not found in list. */
11069 return AARCH64_PARSE_INVALID_ARG;
11070 }
11071
11072 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11073 Return an aarch64_parse_opt_result describing the parse result.
11074 If the parsing fails the RES does not change. */
11075
11076 static enum aarch64_parse_opt_result
11077 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11078 {
11079 const struct processor *cpu;
11080
11081 /* Loop through the list of supported CPUs to find a match. */
11082 for (cpu = all_cores; cpu->name != NULL; cpu++)
11083 {
11084 if (strcmp (cpu->name, to_parse) == 0)
11085 {
11086 *res = cpu;
11087 return AARCH64_PARSE_OK;
11088 }
11089 }
11090
11091 /* CPU name not found in list. */
11092 return AARCH64_PARSE_INVALID_ARG;
11093 }
11094
11095 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11096 described in FLAG. If it is, return the index bit for that fusion type.
11097 If not, error (printing OPTION_NAME) and return zero. */
11098
11099 static unsigned int
11100 aarch64_parse_one_option_token (const char *token,
11101 size_t length,
11102 const struct aarch64_flag_desc *flag,
11103 const char *option_name)
11104 {
11105 for (; flag->name != NULL; flag++)
11106 {
11107 if (length == strlen (flag->name)
11108 && !strncmp (flag->name, token, length))
11109 return flag->flag;
11110 }
11111
11112 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
11113 return 0;
11114 }
11115
11116 /* Parse OPTION which is a comma-separated list of flags to enable.
11117 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11118 default state we inherit from the CPU tuning structures. OPTION_NAME
11119 gives the top-level option we are parsing in the -moverride string,
11120 for use in error messages. */
11121
11122 static unsigned int
11123 aarch64_parse_boolean_options (const char *option,
11124 const struct aarch64_flag_desc *flags,
11125 unsigned int initial_state,
11126 const char *option_name)
11127 {
11128 const char separator = '.';
11129 const char* specs = option;
11130 const char* ntoken = option;
11131 unsigned int found_flags = initial_state;
11132
11133 while ((ntoken = strchr (specs, separator)))
11134 {
11135 size_t token_length = ntoken - specs;
11136 unsigned token_ops = aarch64_parse_one_option_token (specs,
11137 token_length,
11138 flags,
11139 option_name);
11140 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11141 in the token stream, reset the supported operations. So:
11142
11143 adrp+add.cmp+branch.none.adrp+add
11144
11145 would have the result of turning on only adrp+add fusion. */
11146 if (!token_ops)
11147 found_flags = 0;
11148
11149 found_flags |= token_ops;
11150 specs = ++ntoken;
11151 }
11152
11153 /* The string ended with a trailing separator, which is ill-formed. */
11154 if (!(*specs))
11155 {
11156 error ("%s string ill-formed", option_name);
11157 return 0;
11158 }
11159
11160 /* We still have one more token to parse. */
11161 size_t token_length = strlen (specs);
11162 unsigned token_ops = aarch64_parse_one_option_token (specs,
11163 token_length,
11164 flags,
11165 option_name);
11166 if (!token_ops)
11167 found_flags = 0;
11168
11169 found_flags |= token_ops;
11170 return found_flags;
11171 }
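
/* Illustrative example: given the fusible pair table, the string

     "adrp+add.cmp+branch.none.adrp+add"

   accumulates the adrp+add and cmp+branch bits, is reset to zero when
   "none" (whose flag value is zero) is parsed, and finally returns with
   only the adrp+add bit set, exactly as described in the comment inside
   the loop above.  */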
11172
11173 /* Support for overriding instruction fusion. */
11174
11175 static void
11176 aarch64_parse_fuse_string (const char *fuse_string,
11177 struct tune_params *tune)
11178 {
11179 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11180 aarch64_fusible_pairs,
11181 tune->fusible_ops,
11182 "fuse=");
11183 }
11184
11185 /* Support for overriding other tuning flags. */
11186
11187 static void
11188 aarch64_parse_tune_string (const char *tune_string,
11189 struct tune_params *tune)
11190 {
11191 tune->extra_tuning_flags
11192 = aarch64_parse_boolean_options (tune_string,
11193 aarch64_tuning_flags,
11194 tune->extra_tuning_flags,
11195 "tune=");
11196 }
11197
11198 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11199 Accept the valid SVE vector widths allowed by
11200 aarch64_sve_vector_bits_enum and use it to override sve_width
11201 in TUNE. */
11202
11203 static void
11204 aarch64_parse_sve_width_string (const char *tune_string,
11205 struct tune_params *tune)
11206 {
11207 int width = -1;
11208
11209 int n = sscanf (tune_string, "%d", &width);
11210 if (n == EOF)
11211 {
11212 error ("invalid format for sve_width");
11213 return;
11214 }
11215 switch (width)
11216 {
11217 case SVE_128:
11218 case SVE_256:
11219 case SVE_512:
11220 case SVE_1024:
11221 case SVE_2048:
11222 break;
11223 default:
11224 error ("invalid sve_width value: %d", width);
11225 }
11226 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11227 }
11228
11229 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11230 we understand. If it is, extract the option string and hand it off to
11231 the appropriate function. */
11232
11233 void
11234 aarch64_parse_one_override_token (const char* token,
11235 size_t length,
11236 struct tune_params *tune)
11237 {
11238 const struct aarch64_tuning_override_function *fn
11239 = aarch64_tuning_override_functions;
11240
11241 const char *option_part = strchr (token, '=');
11242 if (!option_part)
11243 {
11244 error ("tuning string missing in option (%s)", token);
11245 return;
11246 }
11247
11248 /* Get the length of the option name. */
11249 length = option_part - token;
11250 /* Skip the '=' to get to the option string. */
11251 option_part++;
11252
11253 for (; fn->name != NULL; fn++)
11254 {
11255 if (!strncmp (fn->name, token, length))
11256 {
11257 fn->parse_override (option_part, tune);
11258 return;
11259 }
11260 }
11261
11262 error ("unknown tuning option (%s)", token);
11263 return;
11264 }
11265
11266 /* Validate the TLS size, clamping it to what the code model allows. */
11267
11268 static void
11269 initialize_aarch64_tls_size (struct gcc_options *opts)
11270 {
11271 if (aarch64_tls_size == 0)
11272 aarch64_tls_size = 24;
11273
11274 switch (opts->x_aarch64_cmodel_var)
11275 {
11276 case AARCH64_CMODEL_TINY:
11277 /* Both the default and the maximum TLS size allowed under tiny are 1M,
11278 which needs two instructions to address, so we clamp the size to 24. */
11279 if (aarch64_tls_size > 24)
11280 aarch64_tls_size = 24;
11281 break;
11282 case AARCH64_CMODEL_SMALL:
11283 /* The maximum TLS size allowed under small is 4G. */
11284 if (aarch64_tls_size > 32)
11285 aarch64_tls_size = 32;
11286 break;
11287 case AARCH64_CMODEL_LARGE:
11288 /* The maximum TLS size allowed under large is 16E.
11289 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
11290 if (aarch64_tls_size > 48)
11291 aarch64_tls_size = 48;
11292 break;
11293 default:
11294 gcc_unreachable ();
11295 }
11296
11297 return;
11298 }
11299
11300 /* Parse STRING looking for options in the format:
11301 string :: option:string
11302 option :: name=substring
11303 name :: {a-z}
11304 substring :: defined by option. */
11305
11306 static void
11307 aarch64_parse_override_string (const char* input_string,
11308 struct tune_params* tune)
11309 {
11310 const char separator = ':';
11311 size_t string_length = strlen (input_string) + 1;
11312 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11313 char *string = string_root;
11314 strncpy (string, input_string, string_length);
11315 string[string_length - 1] = '\0';
11316
11317 char* ntoken = string;
11318
11319 while ((ntoken = strchr (string, separator)))
11320 {
11321 size_t token_length = ntoken - string;
11322 /* Make this substring look like a string. */
11323 *ntoken = '\0';
11324 aarch64_parse_one_override_token (string, token_length, tune);
11325 string = ++ntoken;
11326 }
11327
11328 /* One last option to parse. */
11329 aarch64_parse_one_override_token (string, strlen (string), tune);
11330 free (string_root);
11331 }
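
/* Putting the pieces together, an override string such as

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   (illustrative values) is first split on ':' into
   "fuse=adrp+add.cmp+branch" and "sve_width=256";
   aarch64_parse_one_override_token then splits each token at '=' and
   dispatches to aarch64_parse_fuse_string and
   aarch64_parse_sve_width_string respectively, the accepted names coming
   from aarch64_tuning_override_functions.  */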
11332
11333
11334 static void
11335 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11336 {
11337 if (accepted_branch_protection_string)
11338 {
11339 opts->x_aarch64_branch_protection_string
11340 = xstrdup (accepted_branch_protection_string);
11341 }
11342
11343 /* PR 70044: We have to be careful about being called multiple times for the
11344 same function. This means all changes should be repeatable. */
11345
11346 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11347 Disable the frame pointer flag so the mid-end will not use a frame
11348 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11349 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11350 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11351 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11352 if (opts->x_flag_omit_frame_pointer == 0)
11353 opts->x_flag_omit_frame_pointer = 2;
11354
11355 /* If not optimizing for size, set the default
11356 alignment to what the target wants. */
11357 if (!opts->x_optimize_size)
11358 {
11359 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11360 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11361 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11362 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11363 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11364 opts->x_str_align_functions = aarch64_tune_params.function_align;
11365 }
11366
11367 /* We default to no pc-relative literal loads. */
11368
11369 aarch64_pcrelative_literal_loads = false;
11370
11371 /* If -mpc-relative-literal-loads is set on the command line, this
11372 implies that the user asked for PC relative literal loads. */
11373 if (opts->x_pcrelative_literal_loads == 1)
11374 aarch64_pcrelative_literal_loads = true;
11375
11376 /* In the tiny memory model it makes no sense to disallow PC relative
11377 literal pool loads. */
11378 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11379 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11380 aarch64_pcrelative_literal_loads = true;
11381
11382 /* When enabling the lower precision Newton series for the square root, also
11383 enable it for the reciprocal square root, since the latter is an
11384 intermediate step for the former. */
11385 if (flag_mlow_precision_sqrt)
11386 flag_mrecip_low_precision_sqrt = true;
11387 }
11388
11389 /* 'Unpack' the internal tuning structs and update the options
11390 in OPTS. The caller must have set up selected_tune and selected_arch
11391 as all the other target-specific codegen decisions are
11392 derived from them. */
11393
11394 void
11395 aarch64_override_options_internal (struct gcc_options *opts)
11396 {
11397 aarch64_tune_flags = selected_tune->flags;
11398 aarch64_tune = selected_tune->sched_core;
11399 /* Make a copy of the tuning parameters attached to the core, which
11400 we may later overwrite. */
11401 aarch64_tune_params = *(selected_tune->tune);
11402 aarch64_architecture_version = selected_arch->architecture_version;
11403
11404 if (opts->x_aarch64_override_tune_string)
11405 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11406 &aarch64_tune_params);
11407
11408 /* This target defaults to strict volatile bitfields. */
11409 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11410 opts->x_flag_strict_volatile_bitfields = 1;
11411
11412 if (aarch64_stack_protector_guard == SSP_GLOBAL
11413 && opts->x_aarch64_stack_protector_guard_offset_str)
11414 {
11415 error ("incompatible options -mstack-protector-guard=global and "
11416 "-mstack-protector-guard-offset=%qs",
11417 aarch64_stack_protector_guard_offset_str);
11418 }
11419
11420 if (aarch64_stack_protector_guard == SSP_SYSREG
11421 && !(opts->x_aarch64_stack_protector_guard_offset_str
11422 && opts->x_aarch64_stack_protector_guard_reg_str))
11423 {
11424 error ("both -mstack-protector-guard-offset and "
11425 "-mstack-protector-guard-reg must be used "
11426 "with -mstack-protector-guard=sysreg");
11427 }
11428
11429 if (opts->x_aarch64_stack_protector_guard_reg_str)
11430 {
11431 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11432 error ("specify a system register with a small string length");
11433 }
11434
11435 if (opts->x_aarch64_stack_protector_guard_offset_str)
11436 {
11437 char *end;
11438 const char *str = aarch64_stack_protector_guard_offset_str;
11439 errno = 0;
11440 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11441 if (!*str || *end || errno)
11442 error ("%qs is not a valid offset in %qs", str,
11443 "-mstack-protector-guard-offset=");
11444 aarch64_stack_protector_guard_offset = offs;
11445 }
11446
11447 initialize_aarch64_code_model (opts);
11448 initialize_aarch64_tls_size (opts);
11449
11450 int queue_depth = 0;
11451 switch (aarch64_tune_params.autoprefetcher_model)
11452 {
11453 case tune_params::AUTOPREFETCHER_OFF:
11454 queue_depth = -1;
11455 break;
11456 case tune_params::AUTOPREFETCHER_WEAK:
11457 queue_depth = 0;
11458 break;
11459 case tune_params::AUTOPREFETCHER_STRONG:
11460 queue_depth = max_insn_queue_index + 1;
11461 break;
11462 default:
11463 gcc_unreachable ();
11464 }
11465
11466 /* We don't mind passing in global_options_set here as we don't use
11467 the *options_set structs anyway. */
11468 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11469 queue_depth,
11470 opts->x_param_values,
11471 global_options_set.x_param_values);
11472
11473 /* Set up parameters to be used in prefetching algorithm. Do not
11474 override the defaults unless we are tuning for a core we have
11475 researched values for. */
11476 if (aarch64_tune_params.prefetch->num_slots > 0)
11477 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11478 aarch64_tune_params.prefetch->num_slots,
11479 opts->x_param_values,
11480 global_options_set.x_param_values);
11481 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11482 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11483 aarch64_tune_params.prefetch->l1_cache_size,
11484 opts->x_param_values,
11485 global_options_set.x_param_values);
11486 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11487 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11488 aarch64_tune_params.prefetch->l1_cache_line_size,
11489 opts->x_param_values,
11490 global_options_set.x_param_values);
11491 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11492 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11493 aarch64_tune_params.prefetch->l2_cache_size,
11494 opts->x_param_values,
11495 global_options_set.x_param_values);
11496 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11497 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11498 0,
11499 opts->x_param_values,
11500 global_options_set.x_param_values);
11501 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11502 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11503 aarch64_tune_params.prefetch->minimum_stride,
11504 opts->x_param_values,
11505 global_options_set.x_param_values);
11506
11507 /* Use the alternative scheduling-pressure algorithm by default. */
11508 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11509 opts->x_param_values,
11510 global_options_set.x_param_values);
11511
11512 /* If the user hasn't changed it via configure then set the default to 64 KB
11513 for the backend. */
11514 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11515 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11516 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11517 opts->x_param_values,
11518 global_options_set.x_param_values);
11519
11520 /* Validate the guard size. */
11521 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11522
11523 /* Enforce that the probing interval is the same as the guard size so the
11524 mid-end does the right thing. */
11525 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11526 guard_size,
11527 opts->x_param_values,
11528 global_options_set.x_param_values);
11529
11530 /* The maybe_set calls won't update the value if the user has explicitly set
11531 one. Which means we need to validate that probing interval and guard size
11532 are equal. */
11533 int probe_interval
11534 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11535 if (guard_size != probe_interval)
11536 error ("stack clash guard size '%d' must be equal to probing interval "
11537 "'%d'", guard_size, probe_interval);
11538
11539 /* Enable software prefetching at the optimization level specified by the
11540 tuning structures for CPUs that have prefetch tuning data, unless the
11541 user set the flag explicitly or we are optimizing for size. */
11542 if (opts->x_flag_prefetch_loop_arrays < 0
11543 && !opts->x_optimize_size
11544 && aarch64_tune_params.prefetch->default_opt_level >= 0
11545 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11546 opts->x_flag_prefetch_loop_arrays = 1;
11547
11548 if (opts->x_aarch64_arch_string == NULL)
11549 opts->x_aarch64_arch_string = selected_arch->name;
11550 if (opts->x_aarch64_cpu_string == NULL)
11551 opts->x_aarch64_cpu_string = selected_cpu->name;
11552 if (opts->x_aarch64_tune_string == NULL)
11553 opts->x_aarch64_tune_string = selected_tune->name;
11554
11555 aarch64_override_options_after_change_1 (opts);
11556 }
11557
11558 /* Print a hint with a suggestion for a core or architecture name that
11559 most closely resembles what the user passed in STR. ARCH is true if
11560 the user is asking for an architecture name. ARCH is false if the user
11561 is asking for a core name. */
11562
11563 static void
11564 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11565 {
11566 auto_vec<const char *> candidates;
11567 const struct processor *entry = arch ? all_architectures : all_cores;
11568 for (; entry->name != NULL; entry++)
11569 candidates.safe_push (entry->name);
11570
11571 #ifdef HAVE_LOCAL_CPU_DETECT
11572 /* Add also "native" as possible value. */
11573 if (arch)
11574 candidates.safe_push ("native");
11575 #endif
11576
11577 char *s;
11578 const char *hint = candidates_list_and_hint (str, s, candidates);
11579 if (hint)
11580 inform (input_location, "valid arguments are: %s;"
11581 " did you mean %qs?", s, hint);
11582 else
11583 inform (input_location, "valid arguments are: %s", s);
11584
11585 XDELETEVEC (s);
11586 }
11587
11588 /* Print a hint with a suggestion for a core name that most closely resembles
11589 what the user passed in STR. */
11590
11591 inline static void
11592 aarch64_print_hint_for_core (const char *str)
11593 {
11594 aarch64_print_hint_for_core_or_arch (str, false);
11595 }
11596
11597 /* Print a hint with a suggestion for an architecture name that most closely
11598 resembles what the user passed in STR. */
11599
11600 inline static void
11601 aarch64_print_hint_for_arch (const char *str)
11602 {
11603 aarch64_print_hint_for_core_or_arch (str, true);
11604 }
11605
11606
11607 /* Print a hint with a suggestion for an extension name
11608 that most closely resembles what the user passed in STR. */
11609
11610 void
11611 aarch64_print_hint_for_extensions (const std::string &str)
11612 {
11613 auto_vec<const char *> candidates;
11614 aarch64_get_all_extension_candidates (&candidates);
11615 char *s;
11616 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11617 if (hint)
11618 inform (input_location, "valid arguments are: %s;"
11619 " did you mean %qs?", s, hint);
11620 else
11621 inform (input_location, "valid arguments are: %s", s);
11622
11623 XDELETEVEC (s);
11624 }
11625
11626 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11627 specified in STR and throw errors if appropriate. Put the results if
11628 they are valid in RES and ISA_FLAGS. Return whether the option is
11629 valid. */
11630
11631 static bool
11632 aarch64_validate_mcpu (const char *str, const struct processor **res,
11633 unsigned long *isa_flags)
11634 {
11635 std::string invalid_extension;
11636 enum aarch64_parse_opt_result parse_res
11637 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11638
11639 if (parse_res == AARCH64_PARSE_OK)
11640 return true;
11641
11642 switch (parse_res)
11643 {
11644 case AARCH64_PARSE_MISSING_ARG:
11645 error ("missing cpu name in %<-mcpu=%s%>", str);
11646 break;
11647 case AARCH64_PARSE_INVALID_ARG:
11648 error ("unknown value %qs for -mcpu", str);
11649 aarch64_print_hint_for_core (str);
11650 break;
11651 case AARCH64_PARSE_INVALID_FEATURE:
11652 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11653 invalid_extension.c_str (), str);
11654 aarch64_print_hint_for_extensions (invalid_extension);
11655 break;
11656 default:
11657 gcc_unreachable ();
11658 }
11659
11660 return false;
11661 }
11662
11663 /* Parses CONST_STR for branch protection features specified in
11664 aarch64_branch_protect_types, and sets any global variables required.
11665 Returns the parsing result and copies the last processed token from
11666 CONST_STR into LAST_STR so that it can be used for error reporting. */
11667
11668 static enum
11669 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11670 char** last_str)
11671 {
11672 char *str_root = xstrdup (const_str);
11673 char* token_save = NULL;
11674 char *str = strtok_r (str_root, "+", &token_save);
11675 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11676 if (!str)
11677 res = AARCH64_PARSE_MISSING_ARG;
11678 else
11679 {
11680 char *next_str = strtok_r (NULL, "+", &token_save);
11681 /* Reset the branch protection features to their defaults. */
11682 aarch64_handle_no_branch_protection (NULL, NULL);
11683
11684 while (str && res == AARCH64_PARSE_OK)
11685 {
11686 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11687 bool found = false;
11688 /* Search for this type. */
11689 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11690 {
11691 if (strcmp (str, type->name) == 0)
11692 {
11693 found = true;
11694 res = type->handler (str, next_str);
11695 str = next_str;
11696 next_str = strtok_r (NULL, "+", &token_save);
11697 }
11698 else
11699 type++;
11700 }
11701 if (found && res == AARCH64_PARSE_OK)
11702 {
11703 bool found_subtype = true;
11704 /* Loop through each token until we find one that isn't a
11705 subtype. */
11706 while (found_subtype)
11707 {
11708 found_subtype = false;
11709 const aarch64_branch_protect_type *subtype = type->subtypes;
11710 /* Search for the subtype. */
11711 while (str && subtype && subtype->name && !found_subtype
11712 && res == AARCH64_PARSE_OK)
11713 {
11714 if (strcmp (str, subtype->name) == 0)
11715 {
11716 found_subtype = true;
11717 res = subtype->handler (str, next_str);
11718 str = next_str;
11719 next_str = strtok_r (NULL, "+", &token_save);
11720 }
11721 else
11722 subtype++;
11723 }
11724 }
11725 }
11726 else if (!found)
11727 res = AARCH64_PARSE_INVALID_ARG;
11728 }
11729 }
11730 /* Copy the last processed token into the argument to pass it back.
11731 Used by option and attribute validation to print the offending token. */
11732 if (last_str)
11733 {
11734 if (str) strcpy (*last_str, str);
11735 else *last_str = NULL;
11736 }
11737 if (res == AARCH64_PARSE_OK)
11738 {
11739 /* If needed, alloc the accepted string then copy in const_str.
11740 Used by aarch64_override_options_after_change_1. */
11741 if (!accepted_branch_protection_string)
11742 accepted_branch_protection_string = (char *) xmalloc (
11743 BRANCH_PROTECT_STR_MAX
11744 + 1);
11745 strncpy (accepted_branch_protection_string, const_str,
11746 BRANCH_PROTECT_STR_MAX + 1);
11747 /* Forcibly null-terminate. */
11748 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11749 }
11750 return res;
11751 }
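
/* Illustrative walk-through, assuming the usual -mbranch-protection
   values: for "pac-ret+leaf" the outer loop matches the "pac-ret" entry
   in aarch64_branch_protect_types and runs its handler, then the inner
   loop matches "leaf" against that entry's subtypes and runs the subtype
   handler; a lone "none" or "standard" is handled entirely by its
   top-level handler.  The last token processed is what gets copied into
   LAST_STR for diagnostics.  */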
11752
11753 static bool
11754 aarch64_validate_mbranch_protection (const char *const_str)
11755 {
11756 char *str = (char *) xmalloc (strlen (const_str) + 1);
11757 enum aarch64_parse_opt_result res =
11758 aarch64_parse_branch_protection (const_str, &str);
11759 if (res == AARCH64_PARSE_INVALID_ARG)
11760 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11761 else if (res == AARCH64_PARSE_MISSING_ARG)
11762 error ("missing argument for %<-mbranch-protection=%>");
11763 free (str);
11764 return res == AARCH64_PARSE_OK;
11765 }
11766
11767 /* Validate a command-line -march option. Parse the arch and extensions
11768 (if any) specified in STR and throw errors if appropriate. Put the
11769 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11770 option is valid. */
11771
11772 static bool
11773 aarch64_validate_march (const char *str, const struct processor **res,
11774 unsigned long *isa_flags)
11775 {
11776 std::string invalid_extension;
11777 enum aarch64_parse_opt_result parse_res
11778 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11779
11780 if (parse_res == AARCH64_PARSE_OK)
11781 return true;
11782
11783 switch (parse_res)
11784 {
11785 case AARCH64_PARSE_MISSING_ARG:
11786 error ("missing arch name in %<-march=%s%>", str);
11787 break;
11788 case AARCH64_PARSE_INVALID_ARG:
11789 error ("unknown value %qs for -march", str);
11790 aarch64_print_hint_for_arch (str);
11791 break;
11792 case AARCH64_PARSE_INVALID_FEATURE:
11793 error ("invalid feature modifier %qs in %<-march=%s%>",
11794 invalid_extension.c_str (), str);
11795 aarch64_print_hint_for_extensions (invalid_extension);
11796 break;
11797 default:
11798 gcc_unreachable ();
11799 }
11800
11801 return false;
11802 }
11803
11804 /* Validate a command-line -mtune option. Parse the cpu
11805 specified in STR and throw errors if appropriate. Put the
11806 result, if it is valid, in RES. Return whether the option is
11807 valid. */
11808
11809 static bool
11810 aarch64_validate_mtune (const char *str, const struct processor **res)
11811 {
11812 enum aarch64_parse_opt_result parse_res
11813 = aarch64_parse_tune (str, res);
11814
11815 if (parse_res == AARCH64_PARSE_OK)
11816 return true;
11817
11818 switch (parse_res)
11819 {
11820 case AARCH64_PARSE_MISSING_ARG:
11821 error ("missing cpu name in %<-mtune=%s%>", str);
11822 break;
11823 case AARCH64_PARSE_INVALID_ARG:
11824 error ("unknown value %qs for -mtune", str);
11825 aarch64_print_hint_for_core (str);
11826 break;
11827 default:
11828 gcc_unreachable ();
11829 }
11830 return false;
11831 }
11832
11833 /* Return the CPU corresponding to the enum CPU.
11834 If it doesn't specify a cpu, return the default. */
11835
11836 static const struct processor *
11837 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11838 {
11839 if (cpu != aarch64_none)
11840 return &all_cores[cpu];
11841
11842 /* The & 0x3f is to extract the bottom 6 bits that encode the
11843 default cpu as selected by the --with-cpu GCC configure option
11844 in config.gcc.
11845 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11846 flags mechanism should be reworked to make it more sane. */
11847 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11848 }
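
/* As used here and in aarch64_override_options below, TARGET_CPU_DEFAULT
   packs the configure-time default as:

     bits [5:0]      index into all_cores of the default CPU
                     (extracted with the & 0x3f above)
     remaining bits  the default ISA flags
                     (extracted with TARGET_CPU_DEFAULT >> 6)  */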
11849
11850 /* Return the architecture corresponding to the enum ARCH.
11851 If it doesn't specify a valid architecture, return the default. */
11852
11853 static const struct processor *
11854 aarch64_get_arch (enum aarch64_arch arch)
11855 {
11856 if (arch != aarch64_no_arch)
11857 return &all_architectures[arch];
11858
11859 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11860
11861 return &all_architectures[cpu->arch];
11862 }
11863
11864 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
11865
11866 static poly_uint16
11867 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11868 {
11869 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11870 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11871 deciding which .md file patterns to use and when deciding whether
11872 something is a legitimate address or constant. */
11873 if (value == SVE_SCALABLE || value == SVE_128)
11874 return poly_uint16 (2, 2);
11875 else
11876 return (int) value / 64;
11877 }
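
/* Worked examples: -msve-vector-bits=256 yields 256 / 64 = 4, i.e. four
   64-bit granules per vector, while SVE_SCALABLE (and, per the comment
   above, SVE_128) yields the indeterminate poly_uint16 (2, 2), i.e.
   2 + 2 * x granules where x is the number of 128-bit blocks by which the
   runtime vector length exceeds the 128-bit minimum.  */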
11878
11879 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
11880 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11881 tuning structs. In particular it must set selected_tune and
11882 aarch64_isa_flags that define the available ISA features and tuning
11883 decisions. It must also set selected_arch as this will be used to
11884 output the .arch asm tags for each function. */
11885
11886 static void
11887 aarch64_override_options (void)
11888 {
11889 unsigned long cpu_isa = 0;
11890 unsigned long arch_isa = 0;
11891 aarch64_isa_flags = 0;
11892
11893 bool valid_cpu = true;
11894 bool valid_tune = true;
11895 bool valid_arch = true;
11896
11897 selected_cpu = NULL;
11898 selected_arch = NULL;
11899 selected_tune = NULL;
11900
11901 if (aarch64_branch_protection_string)
11902 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11903
11904 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11905 If either of -march or -mtune is given, they override their
11906 respective component of -mcpu. */
11907 if (aarch64_cpu_string)
11908 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11909 &cpu_isa);
11910
11911 if (aarch64_arch_string)
11912 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11913 &arch_isa);
11914
11915 if (aarch64_tune_string)
11916 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11917
11918 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11919 SUBTARGET_OVERRIDE_OPTIONS;
11920 #endif
11921
11922 /* If the user did not specify a processor, choose the default
11923 one for them. This will be the CPU set during configuration using
11924 --with-cpu, otherwise it is "generic". */
11925 if (!selected_cpu)
11926 {
11927 if (selected_arch)
11928 {
11929 selected_cpu = &all_cores[selected_arch->ident];
11930 aarch64_isa_flags = arch_isa;
11931 explicit_arch = selected_arch->arch;
11932 }
11933 else
11934 {
11935 /* Get default configure-time CPU. */
11936 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
11937 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
11938 }
11939
11940 if (selected_tune)
11941 explicit_tune_core = selected_tune->ident;
11942 }
11943 /* If both -mcpu and -march are specified check that they are architecturally
11944 compatible, warn if they're not and prefer the -march ISA flags. */
11945 else if (selected_arch)
11946 {
11947 if (selected_arch->arch != selected_cpu->arch)
11948 {
11949 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
11950 all_architectures[selected_cpu->arch].name,
11951 selected_arch->name);
11952 }
11953 aarch64_isa_flags = arch_isa;
11954 explicit_arch = selected_arch->arch;
11955 explicit_tune_core = selected_tune ? selected_tune->ident
11956 : selected_cpu->ident;
11957 }
11958 else
11959 {
11960 /* -mcpu but no -march. */
11961 aarch64_isa_flags = cpu_isa;
11962 explicit_tune_core = selected_tune ? selected_tune->ident
11963 : selected_cpu->ident;
11964 gcc_assert (selected_cpu);
11965 selected_arch = &all_architectures[selected_cpu->arch];
11966 explicit_arch = selected_arch->arch;
11967 }
11968
11969 /* Set the arch as well, as we will need it when outputting
11970 the .arch directive in assembly. */
11971 if (!selected_arch)
11972 {
11973 gcc_assert (selected_cpu);
11974 selected_arch = &all_architectures[selected_cpu->arch];
11975 }
11976
11977 if (!selected_tune)
11978 selected_tune = selected_cpu;
11979
11980 if (aarch64_enable_bti == 2)
11981 {
11982 #ifdef TARGET_ENABLE_BTI
11983 aarch64_enable_bti = 1;
11984 #else
11985 aarch64_enable_bti = 0;
11986 #endif
11987 }
11988
11989 /* Return address signing is currently not supported for ILP32 targets. For
11990 LP64 targets use the configured option in the absence of a command-line
11991 option for -mbranch-protection. */
11992 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
11993 {
11994 #ifdef TARGET_ENABLE_PAC_RET
11995 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
11996 aarch64_ra_sign_key = AARCH64_KEY_A;
11997 #else
11998 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
11999 #endif
12000 }
12001
12002 #ifndef HAVE_AS_MABI_OPTION
12003 /* The compiler may have been configured with 2.23.* binutils, which does
12004 not have support for ILP32. */
12005 if (TARGET_ILP32)
12006 error ("assembler does not support -mabi=ilp32");
12007 #endif
12008
12009 /* Convert -msve-vector-bits to a VG count. */
12010 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12011
12012 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12013 sorry ("return address signing is only supported for -mabi=lp64");
12014
12015 /* Make sure we properly set up the explicit options. */
12016 if ((aarch64_cpu_string && valid_cpu)
12017 || (aarch64_tune_string && valid_tune))
12018 gcc_assert (explicit_tune_core != aarch64_none);
12019
12020 if ((aarch64_cpu_string && valid_cpu)
12021 || (aarch64_arch_string && valid_arch))
12022 gcc_assert (explicit_arch != aarch64_no_arch);
12023
12024 /* The pass to insert speculation tracking runs before
12025 shrink-wrapping and the latter does not know how to update the
12026 tracking status. So disable it in this case. */
12027 if (aarch64_track_speculation)
12028 flag_shrink_wrap = 0;
12029
12030 aarch64_override_options_internal (&global_options);
12031
12032 /* Save these options as the default ones in case we push and pop them later
12033 while processing functions with potential target attributes. */
12034 target_option_default_node = target_option_current_node
12035 = build_target_option_node (&global_options);
12036 }
12037
12038 /* Implement targetm.override_options_after_change. */
12039
12040 static void
12041 aarch64_override_options_after_change (void)
12042 {
12043 aarch64_override_options_after_change_1 (&global_options);
12044 }
12045
12046 static struct machine_function *
12047 aarch64_init_machine_status (void)
12048 {
12049 struct machine_function *machine;
12050 machine = ggc_cleared_alloc<machine_function> ();
12051 return machine;
12052 }
12053
12054 void
12055 aarch64_init_expanders (void)
12056 {
12057 init_machine_status = aarch64_init_machine_status;
12058 }
12059
12060 /* Choose the code model to use, taking the PIC options into account. */
12061 static void
12062 initialize_aarch64_code_model (struct gcc_options *opts)
12063 {
12064 if (opts->x_flag_pic)
12065 {
12066 switch (opts->x_aarch64_cmodel_var)
12067 {
12068 case AARCH64_CMODEL_TINY:
12069 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12070 break;
12071 case AARCH64_CMODEL_SMALL:
12072 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12073 aarch64_cmodel = (flag_pic == 2
12074 ? AARCH64_CMODEL_SMALL_PIC
12075 : AARCH64_CMODEL_SMALL_SPIC);
12076 #else
12077 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12078 #endif
12079 break;
12080 case AARCH64_CMODEL_LARGE:
12081 sorry ("code model %qs with -f%s", "large",
12082 opts->x_flag_pic > 1 ? "PIC" : "pic");
12083 break;
12084 default:
12085 gcc_unreachable ();
12086 }
12087 }
12088 else
12089 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12090 }
12091
12092 /* Implement TARGET_OPTION_SAVE. */
12093
12094 static void
12095 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12096 {
12097 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12098 ptr->x_aarch64_branch_protection_string
12099 = opts->x_aarch64_branch_protection_string;
12100 }
12101
12102 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12103 using the information saved in PTR. */
12104
12105 static void
12106 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12107 {
12108 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12109 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12110 opts->x_explicit_arch = ptr->x_explicit_arch;
12111 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12112 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12113 opts->x_aarch64_branch_protection_string
12114 = ptr->x_aarch64_branch_protection_string;
12115 if (opts->x_aarch64_branch_protection_string)
12116 {
12117 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12118 NULL);
12119 }
12120
12121 aarch64_override_options_internal (opts);
12122 }
12123
12124 /* Implement TARGET_OPTION_PRINT. */
12125
12126 static void
12127 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12128 {
12129 const struct processor *cpu
12130 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12131 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12132 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12133 std::string extension
12134 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12135
12136 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12137 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12138 arch->name, extension.c_str ());
12139 }
12140
12141 static GTY(()) tree aarch64_previous_fndecl;
12142
12143 void
12144 aarch64_reset_previous_fndecl (void)
12145 {
12146 aarch64_previous_fndecl = NULL;
12147 }
12148
12149 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12150 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12151 make sure optab availability predicates are recomputed when necessary. */
12152
12153 void
12154 aarch64_save_restore_target_globals (tree new_tree)
12155 {
12156 if (TREE_TARGET_GLOBALS (new_tree))
12157 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12158 else if (new_tree == target_option_default_node)
12159 restore_target_globals (&default_target_globals);
12160 else
12161 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12162 }
12163
12164 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12165 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12166 of the function, if such exists. This function may be called multiple
12167 times on a single function so use aarch64_previous_fndecl to avoid
12168 setting up identical state. */
12169
12170 static void
12171 aarch64_set_current_function (tree fndecl)
12172 {
12173 if (!fndecl || fndecl == aarch64_previous_fndecl)
12174 return;
12175
12176 tree old_tree = (aarch64_previous_fndecl
12177 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12178 : NULL_TREE);
12179
12180 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12181
12182 /* If current function has no attributes but the previous one did,
12183 use the default node. */
12184 if (!new_tree && old_tree)
12185 new_tree = target_option_default_node;
12186
12187 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12188 the default have been handled by aarch64_save_restore_target_globals from
12189 aarch64_pragma_target_parse. */
12190 if (old_tree == new_tree)
12191 return;
12192
12193 aarch64_previous_fndecl = fndecl;
12194
12195 /* First set the target options. */
12196 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12197
12198 aarch64_save_restore_target_globals (new_tree);
12199 }
12200
12201 /* Enum describing the various ways we can handle attributes.
12202 In many cases we can reuse the generic option handling machinery. */
12203
12204 enum aarch64_attr_opt_type
12205 {
12206 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12207 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12208 aarch64_attr_enum, /* Attribute sets an enum variable. */
12209 aarch64_attr_custom /* Attribute requires a custom handling function. */
12210 };
12211
12212 /* All the information needed to handle a target attribute.
12213 NAME is the name of the attribute.
12214 ATTR_TYPE specifies the type of behavior of the attribute as described
12215 in the definition of enum aarch64_attr_opt_type.
12216 ALLOW_NEG is true if the attribute supports a "no-" form.
12217 HANDLER is the function that takes the attribute string as an argument.
12218 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12219 OPT_NUM is the enum specifying the option that the attribute modifies.
12220 This is needed for attributes that mirror the behavior of a command-line
12221 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12222 aarch64_attr_enum. */
12223
12224 struct aarch64_attribute_info
12225 {
12226 const char *name;
12227 enum aarch64_attr_opt_type attr_type;
12228 bool allow_neg;
12229 bool (*handler) (const char *);
12230 enum opt_code opt_num;
12231 };
12232
12233 /* Handle the ARCH_STR argument to the arch= target attribute. */
12234
12235 static bool
12236 aarch64_handle_attr_arch (const char *str)
12237 {
12238 const struct processor *tmp_arch = NULL;
12239 std::string invalid_extension;
12240 enum aarch64_parse_opt_result parse_res
12241 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12242
12243 if (parse_res == AARCH64_PARSE_OK)
12244 {
12245 gcc_assert (tmp_arch);
12246 selected_arch = tmp_arch;
12247 explicit_arch = selected_arch->arch;
12248 return true;
12249 }
12250
12251 switch (parse_res)
12252 {
12253 case AARCH64_PARSE_MISSING_ARG:
12254 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12255 break;
12256 case AARCH64_PARSE_INVALID_ARG:
12257 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12258 aarch64_print_hint_for_arch (str);
12259 break;
12260 case AARCH64_PARSE_INVALID_FEATURE:
12261 error ("invalid feature modifier %s of value (\"%s\") in "
12262 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12263 aarch64_print_hint_for_extensions (invalid_extension);
12264 break;
12265 default:
12266 gcc_unreachable ();
12267 }
12268
12269 return false;
12270 }
12271
12272 /* Handle the argument CPU_STR to the cpu= target attribute. */
12273
12274 static bool
12275 aarch64_handle_attr_cpu (const char *str)
12276 {
12277 const struct processor *tmp_cpu = NULL;
12278 std::string invalid_extension;
12279 enum aarch64_parse_opt_result parse_res
12280 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12281
12282 if (parse_res == AARCH64_PARSE_OK)
12283 {
12284 gcc_assert (tmp_cpu);
12285 selected_tune = tmp_cpu;
12286 explicit_tune_core = selected_tune->ident;
12287
12288 selected_arch = &all_architectures[tmp_cpu->arch];
12289 explicit_arch = selected_arch->arch;
12290 return true;
12291 }
12292
12293 switch (parse_res)
12294 {
12295 case AARCH64_PARSE_MISSING_ARG:
12296 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12297 break;
12298 case AARCH64_PARSE_INVALID_ARG:
12299 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12300 aarch64_print_hint_for_core (str);
12301 break;
12302 case AARCH64_PARSE_INVALID_FEATURE:
12303 error ("invalid feature modifier %s of value (\"%s\") in "
12304 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12305 aarch64_print_hint_for_extensions (invalid_extension);
12306 break;
12307 default:
12308 gcc_unreachable ();
12309 }
12310
12311 return false;
12312 }
12313
12314 /* Handle the argument STR to the branch-protection= attribute. */
12315
12316 static bool
12317 aarch64_handle_attr_branch_protection (const char* str)
12318 {
12319 char *err_str = (char *) xmalloc (strlen (str) + 1);
12320 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12321 &err_str);
12322 bool success = false;
12323 switch (res)
12324 {
12325 case AARCH64_PARSE_MISSING_ARG:
12326 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12327 " attribute");
12328 break;
12329 case AARCH64_PARSE_INVALID_ARG:
12330 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12331 "=\")%> pragma or attribute", err_str);
12332 break;
12333 case AARCH64_PARSE_OK:
12334 success = true;
12335 /* Fall through. */
12336 case AARCH64_PARSE_INVALID_FEATURE:
12337 break;
12338 default:
12339 gcc_unreachable ();
12340 }
12341 free (err_str);
12342 return success;
12343 }
12344
12345 /* Handle the argument STR to the tune= target attribute. */
12346
12347 static bool
12348 aarch64_handle_attr_tune (const char *str)
12349 {
12350 const struct processor *tmp_tune = NULL;
12351 enum aarch64_parse_opt_result parse_res
12352 = aarch64_parse_tune (str, &tmp_tune);
12353
12354 if (parse_res == AARCH64_PARSE_OK)
12355 {
12356 gcc_assert (tmp_tune);
12357 selected_tune = tmp_tune;
12358 explicit_tune_core = selected_tune->ident;
12359 return true;
12360 }
12361
12362 switch (parse_res)
12363 {
12364 case AARCH64_PARSE_INVALID_ARG:
12365 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12366 aarch64_print_hint_for_core (str);
12367 break;
12368 default:
12369 gcc_unreachable ();
12370 }
12371
12372 return false;
12373 }
12374
12375 /* Parse an architecture extensions target attribute string specified in STR.
12376 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12377 if successful. Update aarch64_isa_flags to reflect the ISA features
12378 modified. */
12379
12380 static bool
12381 aarch64_handle_attr_isa_flags (char *str)
12382 {
12383 enum aarch64_parse_opt_result parse_res;
12384 unsigned long isa_flags = aarch64_isa_flags;
12385
12386 /* We allow "+nothing" in the beginning to clear out all architectural
12387 features if the user wants to handpick specific features. */
12388 if (strncmp ("+nothing", str, 8) == 0)
12389 {
12390 isa_flags = 0;
12391 str += 8;
12392 }
12393
12394 std::string invalid_extension;
12395 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12396
12397 if (parse_res == AARCH64_PARSE_OK)
12398 {
12399 aarch64_isa_flags = isa_flags;
12400 return true;
12401 }
12402
12403 switch (parse_res)
12404 {
12405 case AARCH64_PARSE_MISSING_ARG:
12406 error ("missing value in %<target()%> pragma or attribute");
12407 break;
12408
12409 case AARCH64_PARSE_INVALID_FEATURE:
12410 error ("invalid feature modifier %s of value (\"%s\") in "
12411 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12412 break;
12413
12414 default:
12415 gcc_unreachable ();
12416 }
12417
12418 return false;
12419 }
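
/* Illustrative examples of strings this handles: "+nothing+fp" clears
   every ISA bit and then re-enables only the FP extension, while "+crc"
   adds the CRC extension on top of the current aarch64_isa_flags.  An
   unknown name produces the AARCH64_PARSE_INVALID_FEATURE diagnostic
   above, with the offending token reported via INVALID_EXTENSION.  */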
12420
12421 /* The target attributes that we support. On top of these we also support just
12422 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12423 handled explicitly in aarch64_process_one_target_attr. */
12424
12425 static const struct aarch64_attribute_info aarch64_attributes[] =
12426 {
12427 { "general-regs-only", aarch64_attr_mask, false, NULL,
12428 OPT_mgeneral_regs_only },
12429 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12430 OPT_mfix_cortex_a53_835769 },
12431 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12432 OPT_mfix_cortex_a53_843419 },
12433 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12434 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12435 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12436 OPT_momit_leaf_frame_pointer },
12437 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12438 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12439 OPT_march_ },
12440 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12441 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12442 OPT_mtune_ },
12443 { "branch-protection", aarch64_attr_custom, false,
12444 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12445 { "sign-return-address", aarch64_attr_enum, false, NULL,
12446 OPT_msign_return_address_ },
12447 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12448 };
12449
12450 /* Parse ARG_STR which contains the definition of one target attribute.
12451 Show appropriate errors if any or return true if the attribute is valid. */
12452
12453 static bool
12454 aarch64_process_one_target_attr (char *arg_str)
12455 {
12456 bool invert = false;
12457
12458 size_t len = strlen (arg_str);
12459
12460 if (len == 0)
12461 {
12462 error ("malformed %<target()%> pragma or attribute");
12463 return false;
12464 }
12465
12466 char *str_to_check = (char *) alloca (len + 1);
12467 strcpy (str_to_check, arg_str);
12468
12469 /* Skip leading whitespace. */
12470 while (*str_to_check == ' ' || *str_to_check == '\t')
12471 str_to_check++;
12472
12473 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12474 It is easier to detect and handle it explicitly here rather than going
12475 through the machinery for the rest of the target attributes in this
12476 function. */
12477 if (*str_to_check == '+')
12478 return aarch64_handle_attr_isa_flags (str_to_check);
12479
12480 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12481 {
12482 invert = true;
12483 str_to_check += 3;
12484 }
12485 char *arg = strchr (str_to_check, '=');
12486
12487 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12488 and point ARG to "foo". */
12489 if (arg)
12490 {
12491 *arg = '\0';
12492 arg++;
12493 }
12494 const struct aarch64_attribute_info *p_attr;
12495 bool found = false;
12496 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12497 {
12498 /* If the names don't match up, or the user has given an argument
12499 to an attribute that doesn't accept one, or didn't give an argument
12500 to an attribute that expects one, fail to match. */
12501 if (strcmp (str_to_check, p_attr->name) != 0)
12502 continue;
12503
12504 found = true;
12505 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12506 || p_attr->attr_type == aarch64_attr_enum;
12507
12508 if (attr_need_arg_p ^ (arg != NULL))
12509 {
12510 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12511 return false;
12512 }
12513
12514 /* If the name matches but the attribute does not allow "no-" versions
12515 then we can't match. */
12516 if (invert && !p_attr->allow_neg)
12517 {
12518 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12519 return false;
12520 }
12521
12522 switch (p_attr->attr_type)
12523 {
12524 /* Has a custom handler registered.
12525 For example, cpu=, arch=, tune=. */
12526 case aarch64_attr_custom:
12527 gcc_assert (p_attr->handler);
12528 if (!p_attr->handler (arg))
12529 return false;
12530 break;
12531
12532 /* Either set or unset a boolean option. */
12533 case aarch64_attr_bool:
12534 {
12535 struct cl_decoded_option decoded;
12536
12537 generate_option (p_attr->opt_num, NULL, !invert,
12538 CL_TARGET, &decoded);
12539 aarch64_handle_option (&global_options, &global_options_set,
12540 &decoded, input_location);
12541 break;
12542 }
12543 /* Set or unset a bit in the target_flags. aarch64_handle_option
12544 should know what mask to apply given the option number. */
12545 case aarch64_attr_mask:
12546 {
12547 struct cl_decoded_option decoded;
12548 /* We only need to specify the option number.
12549 aarch64_handle_option will know which mask to apply. */
12550 decoded.opt_index = p_attr->opt_num;
12551 decoded.value = !invert;
12552 aarch64_handle_option (&global_options, &global_options_set,
12553 &decoded, input_location);
12554 break;
12555 }
12556 /* Use the option setting machinery to set an option to an enum. */
12557 case aarch64_attr_enum:
12558 {
12559 gcc_assert (arg);
12560 bool valid;
12561 int value;
12562 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12563 &value, CL_TARGET);
12564 if (valid)
12565 {
12566 set_option (&global_options, NULL, p_attr->opt_num, value,
12567 NULL, DK_UNSPECIFIED, input_location,
12568 global_dc);
12569 }
12570 else
12571 {
12572 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12573 }
12574 break;
12575 }
12576 default:
12577 gcc_unreachable ();
12578 }
12579 }
12580
12581 /* If we reached here we either have found an attribute and validated
12582 it or didn't match any. If we matched an attribute but its arguments
12583 were malformed we will have returned false already. */
12584 return found;
12585 }
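
/* Illustrative sketch, not part of the original sources: the "+..." form
   short-circuited at the top of aarch64_process_one_target_attr is the
   per-function ISA-extension syntax. Assuming GCC configured for this
   backend, user code along these lines is routed through
   aarch64_handle_attr_isa_flags; the function name crc_byte is
   hypothetical. */

#include <arm_acle.h>

__attribute__ ((target ("+crc")))
static unsigned int
crc_byte (unsigned int crc, unsigned char value)
{
  /* __crc32b comes from arm_acle.h and needs the CRC extension, which the
     attribute above enables for this function only.  */
  return __crc32b (crc, value);
}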
12586
12587 /* Count how many times the character C appears in
12588 NULL-terminated string STR. */
12589
12590 static unsigned int
12591 num_occurences_in_str (char c, char *str)
12592 {
12593 unsigned int res = 0;
12594 while (*str != '\0')
12595 {
12596 if (*str == c)
12597 res++;
12598
12599 str++;
12600 }
12601
12602 return res;
12603 }
12604
12605 /* Parse the tree in ARGS that contains the target attribute information
12606 and update the global target options space. */
12607
12608 bool
12609 aarch64_process_target_attr (tree args)
12610 {
12611 if (TREE_CODE (args) == TREE_LIST)
12612 {
12613 do
12614 {
12615 tree head = TREE_VALUE (args);
12616 if (head)
12617 {
12618 if (!aarch64_process_target_attr (head))
12619 return false;
12620 }
12621 args = TREE_CHAIN (args);
12622 } while (args);
12623
12624 return true;
12625 }
12626
12627 if (TREE_CODE (args) != STRING_CST)
12628 {
12629 error ("attribute %<target%> argument not a string");
12630 return false;
12631 }
12632
12633 size_t len = strlen (TREE_STRING_POINTER (args));
12634 char *str_to_check = (char *) alloca (len + 1);
12635 strcpy (str_to_check, TREE_STRING_POINTER (args));
12636
12637 if (len == 0)
12638 {
12639 error ("malformed %<target()%> pragma or attribute");
12640 return false;
12641 }
12642
12643 /* Used to catch empty entries between commas, i.e.
12644 attribute ((target ("attr1,,attr2"))). */
12645 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12646
12647 /* Handle multiple target attributes separated by ','. */
12648 char *token = strtok_r (str_to_check, ",", &str_to_check);
12649
12650 unsigned int num_attrs = 0;
12651 while (token)
12652 {
12653 num_attrs++;
12654 if (!aarch64_process_one_target_attr (token))
12655 {
12656 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12657 return false;
12658 }
12659
12660 token = strtok_r (NULL, ",", &str_to_check);
12661 }
12662
12663 if (num_attrs != num_commas + 1)
12664 {
12665 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12666 return false;
12667 }
12668
12669 return true;
12670 }
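
/* Illustrative sketch, not part of the original sources: comma-separated
   attribute strings as split by aarch64_process_target_attr above, in both
   the attribute and the #pragma GCC target forms. Function names are
   hypothetical; the attribute names come from the aarch64_attributes table
   above. */

__attribute__ ((target ("arch=armv8.2-a,strict-align")))
static int
load_strict (int *p)
{
  return *p;
}

#pragma GCC push_options
#pragma GCC target ("tune=cortex-a72,no-omit-leaf-frame-pointer")
static int
leaf_example (int x)
{
  return x + 1;
}
#pragma GCC pop_options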
12671
12672 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12673 process attribute ((target ("..."))). */
12674
12675 static bool
12676 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12677 {
12678 struct cl_target_option cur_target;
12679 bool ret;
12680 tree old_optimize;
12681 tree new_target, new_optimize;
12682 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12683
12684 /* If what we're processing is the current pragma string then the
12685 target option node is already stored in target_option_current_node
12686 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12687 having to re-parse the string. This is especially useful to keep
12688 arm_neon.h compile times down since that header contains a lot
12689 of intrinsics enclosed in pragmas. */
12690 if (!existing_target && args == current_target_pragma)
12691 {
12692 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12693 return true;
12694 }
12695 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12696
12697 old_optimize = build_optimization_node (&global_options);
12698 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12699
12700 /* If the function changed the optimization levels as well as setting
12701 target options, start with the optimizations specified. */
12702 if (func_optimize && func_optimize != old_optimize)
12703 cl_optimization_restore (&global_options,
12704 TREE_OPTIMIZATION (func_optimize));
12705
12706 /* Save the current target options to restore at the end. */
12707 cl_target_option_save (&cur_target, &global_options);
12708
12709 /* If fndecl already has some target attributes applied to it, unpack
12710 them so that we add this attribute on top of them, rather than
12711 overwriting them. */
12712 if (existing_target)
12713 {
12714 struct cl_target_option *existing_options
12715 = TREE_TARGET_OPTION (existing_target);
12716
12717 if (existing_options)
12718 cl_target_option_restore (&global_options, existing_options);
12719 }
12720 else
12721 cl_target_option_restore (&global_options,
12722 TREE_TARGET_OPTION (target_option_current_node));
12723
12724 ret = aarch64_process_target_attr (args);
12725
12726 /* Set up any additional state. */
12727 if (ret)
12728 {
12729 aarch64_override_options_internal (&global_options);
12730 /* Initialize SIMD builtins if we haven't already.
12731 Set current_target_pragma to NULL for the duration so that
12732 the builtin initialization code doesn't try to tag the functions
12733 being built with the attributes specified by any current pragma, thus
12734 going into an infinite recursion. */
12735 if (TARGET_SIMD)
12736 {
12737 tree saved_current_target_pragma = current_target_pragma;
12738 current_target_pragma = NULL;
12739 aarch64_init_simd_builtins ();
12740 current_target_pragma = saved_current_target_pragma;
12741 }
12742 new_target = build_target_option_node (&global_options);
12743 }
12744 else
12745 new_target = NULL;
12746
12747 new_optimize = build_optimization_node (&global_options);
12748
12749 if (fndecl && ret)
12750 {
12751 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12752
12753 if (old_optimize != new_optimize)
12754 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12755 }
12756
12757 cl_target_option_restore (&global_options, &cur_target);
12758
12759 if (old_optimize != new_optimize)
12760 cl_optimization_restore (&global_options,
12761 TREE_OPTIMIZATION (old_optimize));
12762 return ret;
12763 }
12764
12765 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12766 tri-bool options (yes, no, don't care) and the default value is
12767 DEF, determine whether to reject inlining. */
12768
12769 static bool
12770 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12771 int dont_care, int def)
12772 {
12773 /* If the callee doesn't care, always allow inlining. */
12774 if (callee == dont_care)
12775 return true;
12776
12777 /* If the caller doesn't care, always allow inlining. */
12778 if (caller == dont_care)
12779 return true;
12780
12781 /* Otherwise, allow inlining if either the callee and caller values
12782 agree, or if the callee is using the default value. */
12783 return (callee == caller || callee == def);
12784 }
12785
12786 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12787 to inline CALLEE into CALLER based on target-specific info.
12788 Make sure that the caller and callee have compatible architectural
12789 features. Then go through the other possible target attributes
12790 and see if they can block inlining. Try not to reject always_inline
12791 callees unless they are incompatible architecturally. */
12792
12793 static bool
12794 aarch64_can_inline_p (tree caller, tree callee)
12795 {
12796 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12797 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12798
12799 struct cl_target_option *caller_opts
12800 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12801 : target_option_default_node);
12802
12803 struct cl_target_option *callee_opts
12804 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12805 : target_option_default_node);
12806
12807 /* Callee's ISA flags should be a subset of the caller's. */
12808 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12809 != callee_opts->x_aarch64_isa_flags)
12810 return false;
12811
12812 /* Allow non-strict-align functions to be inlined into strict-align
12813 ones, but not the other way around. */
12814 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12815 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12816 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12817 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12818 return false;
12819
12820 bool always_inline = lookup_attribute ("always_inline",
12821 DECL_ATTRIBUTES (callee));
12822
12823 /* If the architectural features match up and the callee is always_inline
12824 then the other attributes don't matter. */
12825 if (always_inline)
12826 return true;
12827
12828 if (caller_opts->x_aarch64_cmodel_var
12829 != callee_opts->x_aarch64_cmodel_var)
12830 return false;
12831
12832 if (caller_opts->x_aarch64_tls_dialect
12833 != callee_opts->x_aarch64_tls_dialect)
12834 return false;
12835
12836 /* Honour explicit requests to workaround errata. */
12837 if (!aarch64_tribools_ok_for_inlining_p (
12838 caller_opts->x_aarch64_fix_a53_err835769,
12839 callee_opts->x_aarch64_fix_a53_err835769,
12840 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12841 return false;
12842
12843 if (!aarch64_tribools_ok_for_inlining_p (
12844 caller_opts->x_aarch64_fix_a53_err843419,
12845 callee_opts->x_aarch64_fix_a53_err843419,
12846 2, TARGET_FIX_ERR_A53_843419))
12847 return false;
12848
12849 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12850 caller and callee and they don't match up, reject inlining. */
12851 if (!aarch64_tribools_ok_for_inlining_p (
12852 caller_opts->x_flag_omit_leaf_frame_pointer,
12853 callee_opts->x_flag_omit_leaf_frame_pointer,
12854 2, 1))
12855 return false;
12856
12857 /* If the callee has specific tuning overrides, respect them. */
12858 if (callee_opts->x_aarch64_override_tune_string != NULL
12859 && caller_opts->x_aarch64_override_tune_string == NULL)
12860 return false;
12861
12862 /* If the user specified tuning override strings for the
12863 caller and callee and they don't match up, reject inlining.
12864 We just do a string compare here, we don't analyze the meaning
12865 of the string, as it would be too costly for little gain. */
12866 if (callee_opts->x_aarch64_override_tune_string
12867 && caller_opts->x_aarch64_override_tune_string
12868 && (strcmp (callee_opts->x_aarch64_override_tune_string,
12869 caller_opts->x_aarch64_override_tune_string) != 0))
12870 return false;
12871
12872 return true;
12873 }
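
/* Illustrative sketch, not part of the original sources: the ISA-subset rule
   enforced above. When the caller is built without SVE, its ISA flags do not
   cover the callee's, so the call is not inlined even though the callee is
   marked inline; always_inline would not help, because the mismatch is
   architectural. Function names are hypothetical. */

__attribute__ ((target ("+sve")))
static inline int
callee_sve (int x)
{
  return x + 1;
}

static int
caller_base (int x)
{
  return callee_sve (x);   /* Rejected by aarch64_can_inline_p.  */
}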
12874
12875 /* Return true if SYMBOL_REF X binds locally. */
12876
12877 static bool
12878 aarch64_symbol_binds_local_p (const_rtx x)
12879 {
12880 return (SYMBOL_REF_DECL (x)
12881 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12882 : SYMBOL_REF_LOCAL_P (x));
12883 }
12884
12885 /* Return true if SYMBOL_REF X is thread local. */
12886 static bool
12887 aarch64_tls_symbol_p (rtx x)
12888 {
12889 if (! TARGET_HAVE_TLS)
12890 return false;
12891
12892 if (GET_CODE (x) != SYMBOL_REF)
12893 return false;
12894
12895 return SYMBOL_REF_TLS_MODEL (x) != 0;
12896 }
12897
12898 /* Classify a TLS symbol into one of the TLS kinds. */
12899 enum aarch64_symbol_type
12900 aarch64_classify_tls_symbol (rtx x)
12901 {
12902 enum tls_model tls_kind = tls_symbolic_operand_type (x);
12903
12904 switch (tls_kind)
12905 {
12906 case TLS_MODEL_GLOBAL_DYNAMIC:
12907 case TLS_MODEL_LOCAL_DYNAMIC:
12908 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12909
12910 case TLS_MODEL_INITIAL_EXEC:
12911 switch (aarch64_cmodel)
12912 {
12913 case AARCH64_CMODEL_TINY:
12914 case AARCH64_CMODEL_TINY_PIC:
12915 return SYMBOL_TINY_TLSIE;
12916 default:
12917 return SYMBOL_SMALL_TLSIE;
12918 }
12919
12920 case TLS_MODEL_LOCAL_EXEC:
12921 if (aarch64_tls_size == 12)
12922 return SYMBOL_TLSLE12;
12923 else if (aarch64_tls_size == 24)
12924 return SYMBOL_TLSLE24;
12925 else if (aarch64_tls_size == 32)
12926 return SYMBOL_TLSLE32;
12927 else if (aarch64_tls_size == 48)
12928 return SYMBOL_TLSLE48;
12929 else
12930 gcc_unreachable ();
12931
12932 case TLS_MODEL_EMULATED:
12933 case TLS_MODEL_NONE:
12934 return SYMBOL_FORCE_TO_MEM;
12935
12936 default:
12937 gcc_unreachable ();
12938 }
12939 }
12940
12941 /* Return the correct method for accessing X + OFFSET, where X is either
12942 a SYMBOL_REF or LABEL_REF. */
12943
12944 enum aarch64_symbol_type
12945 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
12946 {
12947 if (GET_CODE (x) == LABEL_REF)
12948 {
12949 switch (aarch64_cmodel)
12950 {
12951 case AARCH64_CMODEL_LARGE:
12952 return SYMBOL_FORCE_TO_MEM;
12953
12954 case AARCH64_CMODEL_TINY_PIC:
12955 case AARCH64_CMODEL_TINY:
12956 return SYMBOL_TINY_ABSOLUTE;
12957
12958 case AARCH64_CMODEL_SMALL_SPIC:
12959 case AARCH64_CMODEL_SMALL_PIC:
12960 case AARCH64_CMODEL_SMALL:
12961 return SYMBOL_SMALL_ABSOLUTE;
12962
12963 default:
12964 gcc_unreachable ();
12965 }
12966 }
12967
12968 if (GET_CODE (x) == SYMBOL_REF)
12969 {
12970 if (aarch64_tls_symbol_p (x))
12971 return aarch64_classify_tls_symbol (x);
12972
12973 switch (aarch64_cmodel)
12974 {
12975 case AARCH64_CMODEL_TINY:
12976 /* When we retrieve symbol + offset address, we have to make sure
12977 the offset does not cause overflow of the final address. But
12978 we have no way of knowing the address of symbol at compile time
12979 so we can't accurately say if the distance between the PC and
12980 symbol + offset is outside the addressable range of +/-1M in the
12981 TINY code model. So we rely on images not being greater than
12982 1M and cap the offset at 1M; anything beyond 1M will have to
12983 be loaded using an alternative mechanism. Furthermore, if the
12984 symbol is a weak reference to something that isn't known to
12985 resolve to a symbol in this module, then force to memory. */
12986 if ((SYMBOL_REF_WEAK (x)
12987 && !aarch64_symbol_binds_local_p (x))
12988 || !IN_RANGE (offset, -1048575, 1048575))
12989 return SYMBOL_FORCE_TO_MEM;
12990 return SYMBOL_TINY_ABSOLUTE;
12991
12992 case AARCH64_CMODEL_SMALL:
12993 /* Same reasoning as the tiny code model, but the offset cap here is
12994 4G. */
12995 if ((SYMBOL_REF_WEAK (x)
12996 && !aarch64_symbol_binds_local_p (x))
12997 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
12998 HOST_WIDE_INT_C (4294967264)))
12999 return SYMBOL_FORCE_TO_MEM;
13000 return SYMBOL_SMALL_ABSOLUTE;
13001
13002 case AARCH64_CMODEL_TINY_PIC:
13003 if (!aarch64_symbol_binds_local_p (x))
13004 return SYMBOL_TINY_GOT;
13005 return SYMBOL_TINY_ABSOLUTE;
13006
13007 case AARCH64_CMODEL_SMALL_SPIC:
13008 case AARCH64_CMODEL_SMALL_PIC:
13009 if (!aarch64_symbol_binds_local_p (x))
13010 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13011 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13012 return SYMBOL_SMALL_ABSOLUTE;
13013
13014 case AARCH64_CMODEL_LARGE:
13015 /* This is alright even in PIC code as the constant
13016 pool reference is always PC relative and within
13017 the same translation unit. */
13018 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13019 return SYMBOL_SMALL_ABSOLUTE;
13020 else
13021 return SYMBOL_FORCE_TO_MEM;
13022
13023 default:
13024 gcc_unreachable ();
13025 }
13026 }
13027
13028 /* By default push everything into the constant pool. */
13029 return SYMBOL_FORCE_TO_MEM;
13030 }
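
/* Illustrative sketch, not part of the original sources: declarations whose
   references the routine above classifies differently in a non-PIC,
   small-code-model compilation. The variable names are hypothetical. */

extern int ordinary_var;                     /* SYMBOL_SMALL_ABSOLUTE.       */
extern int weak_var __attribute__ ((weak));  /* May resolve outside this
                                                module, so references are
                                                SYMBOL_FORCE_TO_MEM.         */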
13031
13032 bool
13033 aarch64_constant_address_p (rtx x)
13034 {
13035 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13036 }
13037
13038 bool
13039 aarch64_legitimate_pic_operand_p (rtx x)
13040 {
13041 if (GET_CODE (x) == SYMBOL_REF
13042 || (GET_CODE (x) == CONST
13043 && GET_CODE (XEXP (x, 0)) == PLUS
13044 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13045 return false;
13046
13047 return true;
13048 }
13049
13050 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13051 that should be rematerialized rather than spilled. */
13052
13053 static bool
13054 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13055 {
13056 /* Support CSE and rematerialization of common constants. */
13057 if (CONST_INT_P (x)
13058 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13059 || GET_CODE (x) == CONST_VECTOR)
13060 return true;
13061
13062 /* Do not allow vector struct mode constants for Advanced SIMD.
13063 We could support 0 and -1 easily, but they need support in
13064 aarch64-simd.md. */
13065 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13066 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13067 return false;
13068
13069 /* Only accept variable-length vector constants if they can be
13070 handled directly.
13071
13072 ??? It would be possible to handle rematerialization of other
13073 constants via secondary reloads. */
13074 if (vec_flags & VEC_ANY_SVE)
13075 return aarch64_simd_valid_immediate (x, NULL);
13076
13077 if (GET_CODE (x) == HIGH)
13078 x = XEXP (x, 0);
13079
13080 /* Accept polynomial constants that can be calculated by using the
13081 destination of a move as the sole temporary. Constants that
13082 require a second temporary cannot be rematerialized (they can't be
13083 forced to memory and also aren't legitimate constants). */
13084 poly_int64 offset;
13085 if (poly_int_rtx_p (x, &offset))
13086 return aarch64_offset_temporaries (false, offset) <= 1;
13087
13088 /* If an offset is being added to something else, we need to allow the
13089 base to be moved into the destination register, meaning that there
13090 are no free temporaries for the offset. */
13091 x = strip_offset (x, &offset);
13092 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13093 return false;
13094
13095 /* Do not allow const (plus (anchor_symbol, const_int)). */
13096 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13097 return false;
13098
13099 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13100 so spilling them is better than rematerialization. */
13101 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13102 return true;
13103
13104 /* Label references are always constant. */
13105 if (GET_CODE (x) == LABEL_REF)
13106 return true;
13107
13108 return false;
13109 }
13110
13111 rtx
13112 aarch64_load_tp (rtx target)
13113 {
13114 if (!target
13115 || GET_MODE (target) != Pmode
13116 || !register_operand (target, Pmode))
13117 target = gen_reg_rtx (Pmode);
13118
13119 /* Can return in any reg. */
13120 emit_insn (gen_aarch64_load_tp_hard (target));
13121 return target;
13122 }
13123
13124 /* On AAPCS systems, this is the "struct __va_list". */
13125 static GTY(()) tree va_list_type;
13126
13127 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13128 Return the type to use as __builtin_va_list.
13129
13130 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13131
13132 struct __va_list
13133 {
13134 void *__stack;
13135 void *__gr_top;
13136 void *__vr_top;
13137 int __gr_offs;
13138 int __vr_offs;
13139 }; */
13140
13141 static tree
13142 aarch64_build_builtin_va_list (void)
13143 {
13144 tree va_list_name;
13145 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13146
13147 /* Create the type. */
13148 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13149 /* Give it the required name. */
13150 va_list_name = build_decl (BUILTINS_LOCATION,
13151 TYPE_DECL,
13152 get_identifier ("__va_list"),
13153 va_list_type);
13154 DECL_ARTIFICIAL (va_list_name) = 1;
13155 TYPE_NAME (va_list_type) = va_list_name;
13156 TYPE_STUB_DECL (va_list_type) = va_list_name;
13157
13158 /* Create the fields. */
13159 f_stack = build_decl (BUILTINS_LOCATION,
13160 FIELD_DECL, get_identifier ("__stack"),
13161 ptr_type_node);
13162 f_grtop = build_decl (BUILTINS_LOCATION,
13163 FIELD_DECL, get_identifier ("__gr_top"),
13164 ptr_type_node);
13165 f_vrtop = build_decl (BUILTINS_LOCATION,
13166 FIELD_DECL, get_identifier ("__vr_top"),
13167 ptr_type_node);
13168 f_groff = build_decl (BUILTINS_LOCATION,
13169 FIELD_DECL, get_identifier ("__gr_offs"),
13170 integer_type_node);
13171 f_vroff = build_decl (BUILTINS_LOCATION,
13172 FIELD_DECL, get_identifier ("__vr_offs"),
13173 integer_type_node);
13174
13175 /* Tell the tree-stdarg pass about our internal offset fields.
13176 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13177 purposes, to identify whether the code is updating the va_list internal
13178 offset fields in an irregular way. */
13179 va_list_gpr_counter_field = f_groff;
13180 va_list_fpr_counter_field = f_vroff;
13181
13182 DECL_ARTIFICIAL (f_stack) = 1;
13183 DECL_ARTIFICIAL (f_grtop) = 1;
13184 DECL_ARTIFICIAL (f_vrtop) = 1;
13185 DECL_ARTIFICIAL (f_groff) = 1;
13186 DECL_ARTIFICIAL (f_vroff) = 1;
13187
13188 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13189 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13190 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13191 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13192 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13193
13194 TYPE_FIELDS (va_list_type) = f_stack;
13195 DECL_CHAIN (f_stack) = f_grtop;
13196 DECL_CHAIN (f_grtop) = f_vrtop;
13197 DECL_CHAIN (f_vrtop) = f_groff;
13198 DECL_CHAIN (f_groff) = f_vroff;
13199
13200 /* Compute its layout. */
13201 layout_type (va_list_type);
13202
13203 return va_list_type;
13204 }
13205
13206 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13207 static void
13208 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13209 {
13210 const CUMULATIVE_ARGS *cum;
13211 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13212 tree stack, grtop, vrtop, groff, vroff;
13213 tree t;
13214 int gr_save_area_size = cfun->va_list_gpr_size;
13215 int vr_save_area_size = cfun->va_list_fpr_size;
13216 int vr_offset;
13217
13218 cum = &crtl->args.info;
13219 if (cfun->va_list_gpr_size)
13220 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13221 cfun->va_list_gpr_size);
13222 if (cfun->va_list_fpr_size)
13223 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13224 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13225
13226 if (!TARGET_FLOAT)
13227 {
13228 gcc_assert (cum->aapcs_nvrn == 0);
13229 vr_save_area_size = 0;
13230 }
13231
13232 f_stack = TYPE_FIELDS (va_list_type_node);
13233 f_grtop = DECL_CHAIN (f_stack);
13234 f_vrtop = DECL_CHAIN (f_grtop);
13235 f_groff = DECL_CHAIN (f_vrtop);
13236 f_vroff = DECL_CHAIN (f_groff);
13237
13238 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13239 NULL_TREE);
13240 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13241 NULL_TREE);
13242 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13243 NULL_TREE);
13244 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13245 NULL_TREE);
13246 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13247 NULL_TREE);
13248
13249 /* Emit code to initialize STACK, which points to the next varargs stack
13250 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13251 by named arguments. STACK is 8-byte aligned. */
13252 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13253 if (cum->aapcs_stack_size > 0)
13254 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13255 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13256 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13257
13258 /* Emit code to initialize GRTOP, the top of the GR save area.
13259 virtual_incoming_args_rtx should have been 16 byte aligned. */
13260 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13261 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13262 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13263
13264 /* Emit code to initialize VRTOP, the top of the VR save area.
13265 This address is gr_save_area_bytes below GRTOP, rounded
13266 down to the next 16-byte boundary. */
13267 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13268 vr_offset = ROUND_UP (gr_save_area_size,
13269 STACK_BOUNDARY / BITS_PER_UNIT);
13270
13271 if (vr_offset)
13272 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13273 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13274 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13275
13276 /* Emit code to initialize GROFF, the offset from GRTOP of the
13277 next GPR argument. */
13278 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13279 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13280 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13281
13282 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13283 of the next VR argument. */
13284 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13285 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13286 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13287 }
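
/* Illustrative sketch, not part of the original sources: a variadic function
   whose va_start expands to the field initialisation above. With the
   hypothetical signature below, one general register (w0) and one vector
   register (d0) are consumed by named arguments, so, in the absence of
   tree-stdarg shrinking, __gr_offs starts at -(7 * 8) and __vr_offs at
   -(7 * 16). */

#include <stdarg.h>

static double
sum_doubles (int count, double first, ...)
{
  va_list ap;
  double total = first;
  int i;

  va_start (ap, first);     /* Initialises __stack, __gr_top, __vr_top,
                               __gr_offs and __vr_offs as emitted above.  */
  for (i = 1; i < count; i++)
    total += va_arg (ap, double);
  va_end (ap);
  return total;
}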
13288
13289 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13290
13291 static tree
13292 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13293 gimple_seq *post_p ATTRIBUTE_UNUSED)
13294 {
13295 tree addr;
13296 bool indirect_p;
13297 bool is_ha; /* is HFA or HVA. */
13298 bool dw_align; /* double-word align. */
13299 machine_mode ag_mode = VOIDmode;
13300 int nregs;
13301 machine_mode mode;
13302
13303 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13304 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13305 HOST_WIDE_INT size, rsize, adjust, align;
13306 tree t, u, cond1, cond2;
13307
13308 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13309 if (indirect_p)
13310 type = build_pointer_type (type);
13311
13312 mode = TYPE_MODE (type);
13313
13314 f_stack = TYPE_FIELDS (va_list_type_node);
13315 f_grtop = DECL_CHAIN (f_stack);
13316 f_vrtop = DECL_CHAIN (f_grtop);
13317 f_groff = DECL_CHAIN (f_vrtop);
13318 f_vroff = DECL_CHAIN (f_groff);
13319
13320 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13321 f_stack, NULL_TREE);
13322 size = int_size_in_bytes (type);
13323 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
13324
13325 dw_align = false;
13326 adjust = 0;
13327 if (aarch64_vfp_is_call_or_return_candidate (mode,
13328 type,
13329 &ag_mode,
13330 &nregs,
13331 &is_ha))
13332 {
13333 /* No frontends can create types with variable-sized modes, so we
13334 shouldn't be asked to pass or return them. */
13335 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13336
13337 /* TYPE passed in fp/simd registers. */
13338 if (!TARGET_FLOAT)
13339 aarch64_err_no_fpadvsimd (mode);
13340
13341 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13342 unshare_expr (valist), f_vrtop, NULL_TREE);
13343 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13344 unshare_expr (valist), f_vroff, NULL_TREE);
13345
13346 rsize = nregs * UNITS_PER_VREG;
13347
13348 if (is_ha)
13349 {
13350 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13351 adjust = UNITS_PER_VREG - ag_size;
13352 }
13353 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13354 && size < UNITS_PER_VREG)
13355 {
13356 adjust = UNITS_PER_VREG - size;
13357 }
13358 }
13359 else
13360 {
13361 /* TYPE passed in general registers. */
13362 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13363 unshare_expr (valist), f_grtop, NULL_TREE);
13364 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13365 unshare_expr (valist), f_groff, NULL_TREE);
13366 rsize = ROUND_UP (size, UNITS_PER_WORD);
13367 nregs = rsize / UNITS_PER_WORD;
13368
13369 if (align > 8)
13370 dw_align = true;
13371
13372 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13373 && size < UNITS_PER_WORD)
13374 {
13375 adjust = UNITS_PER_WORD - size;
13376 }
13377 }
13378
13379 /* Get a local temporary for the field value. */
13380 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13381
13382 /* Emit code to branch if off >= 0. */
13383 t = build2 (GE_EXPR, boolean_type_node, off,
13384 build_int_cst (TREE_TYPE (off), 0));
13385 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13386
13387 if (dw_align)
13388 {
13389 /* Emit: offs = (offs + 15) & -16. */
13390 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13391 build_int_cst (TREE_TYPE (off), 15));
13392 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13393 build_int_cst (TREE_TYPE (off), -16));
13394 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13395 }
13396 else
13397 roundup = NULL;
13398
13399 /* Update ap.__[g|v]r_offs */
13400 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13401 build_int_cst (TREE_TYPE (off), rsize));
13402 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13403
13404 /* String up. */
13405 if (roundup)
13406 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13407
13408 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13409 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13410 build_int_cst (TREE_TYPE (f_off), 0));
13411 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13412
13413 /* String up: make sure the assignment happens before the use. */
13414 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13415 COND_EXPR_ELSE (cond1) = t;
13416
13417 /* Prepare the trees handling the argument that is passed on the stack;
13418 the top-level node will be stored in ON_STACK. */
13419 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13420 if (align > 8)
13421 {
13422 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13423 t = fold_build_pointer_plus_hwi (arg, 15);
13424 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13425 build_int_cst (TREE_TYPE (t), -16));
13426 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13427 }
13428 else
13429 roundup = NULL;
13430 /* Advance ap.__stack */
13431 t = fold_build_pointer_plus_hwi (arg, size + 7);
13432 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13433 build_int_cst (TREE_TYPE (t), -8));
13434 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13435 /* String up roundup and advance. */
13436 if (roundup)
13437 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13438 /* String up with arg */
13439 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13440 /* Big-endianness related address adjustment. */
13441 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13442 && size < UNITS_PER_WORD)
13443 {
13444 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13445 size_int (UNITS_PER_WORD - size));
13446 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13447 }
13448
13449 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13450 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13451
13452 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13453 t = off;
13454 if (adjust)
13455 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13456 build_int_cst (TREE_TYPE (off), adjust));
13457
13458 t = fold_convert (sizetype, t);
13459 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13460
13461 if (is_ha)
13462 {
13463 /* type ha; // treat as "struct {ftype field[n];}"
13464 ... [computing offs]
13465 for (i = 0; i < nregs; ++i, offs += 16)
13466 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13467 return ha; */
13468 int i;
13469 tree tmp_ha, field_t, field_ptr_t;
13470
13471 /* Declare a local variable. */
13472 tmp_ha = create_tmp_var_raw (type, "ha");
13473 gimple_add_tmp_var (tmp_ha);
13474
13475 /* Establish the base type. */
13476 switch (ag_mode)
13477 {
13478 case E_SFmode:
13479 field_t = float_type_node;
13480 field_ptr_t = float_ptr_type_node;
13481 break;
13482 case E_DFmode:
13483 field_t = double_type_node;
13484 field_ptr_t = double_ptr_type_node;
13485 break;
13486 case E_TFmode:
13487 field_t = long_double_type_node;
13488 field_ptr_t = long_double_ptr_type_node;
13489 break;
13490 case E_HFmode:
13491 field_t = aarch64_fp16_type_node;
13492 field_ptr_t = aarch64_fp16_ptr_type_node;
13493 break;
13494 case E_V2SImode:
13495 case E_V4SImode:
13496 {
13497 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13498 field_t = build_vector_type_for_mode (innertype, ag_mode);
13499 field_ptr_t = build_pointer_type (field_t);
13500 }
13501 break;
13502 default:
13503 gcc_assert (0);
13504 }
13505
13506 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
13507 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13508 addr = t;
13509 t = fold_convert (field_ptr_t, addr);
13510 t = build2 (MODIFY_EXPR, field_t,
13511 build1 (INDIRECT_REF, field_t, tmp_ha),
13512 build1 (INDIRECT_REF, field_t, t));
13513
13514 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13515 for (i = 1; i < nregs; ++i)
13516 {
13517 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13518 u = fold_convert (field_ptr_t, addr);
13519 u = build2 (MODIFY_EXPR, field_t,
13520 build2 (MEM_REF, field_t, tmp_ha,
13521 build_int_cst (field_ptr_t,
13522 (i *
13523 int_size_in_bytes (field_t)))),
13524 build1 (INDIRECT_REF, field_t, u));
13525 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13526 }
13527
13528 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13529 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13530 }
13531
13532 COND_EXPR_ELSE (cond2) = t;
13533 addr = fold_convert (build_pointer_type (type), cond1);
13534 addr = build_va_arg_indirect_ref (addr);
13535
13536 if (indirect_p)
13537 addr = build_va_arg_indirect_ref (addr);
13538
13539 return addr;
13540 }
13541
13542 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13543
13544 static void
13545 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13546 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13547 int no_rtl)
13548 {
13549 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13550 CUMULATIVE_ARGS local_cum;
13551 int gr_saved = cfun->va_list_gpr_size;
13552 int vr_saved = cfun->va_list_fpr_size;
13553
13554 /* The caller has advanced CUM up to, but not beyond, the last named
13555 argument. Advance a local copy of CUM past the last "real" named
13556 argument, to find out how many registers are left over. */
13557 local_cum = *cum;
13558 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13559
13560 /* Find out how many registers we need to save.
13561 Honor the tree-stdarg analysis results. */
13562 if (cfun->va_list_gpr_size)
13563 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13564 cfun->va_list_gpr_size / UNITS_PER_WORD);
13565 if (cfun->va_list_fpr_size)
13566 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13567 cfun->va_list_fpr_size / UNITS_PER_VREG);
13568
13569 if (!TARGET_FLOAT)
13570 {
13571 gcc_assert (local_cum.aapcs_nvrn == 0);
13572 vr_saved = 0;
13573 }
13574
13575 if (!no_rtl)
13576 {
13577 if (gr_saved > 0)
13578 {
13579 rtx ptr, mem;
13580
13581 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13582 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13583 - gr_saved * UNITS_PER_WORD);
13584 mem = gen_frame_mem (BLKmode, ptr);
13585 set_mem_alias_set (mem, get_varargs_alias_set ());
13586
13587 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13588 mem, gr_saved);
13589 }
13590 if (vr_saved > 0)
13591 {
13592 /* We can't use move_block_from_reg, because it will use
13593 the wrong mode, storing D regs only. */
13594 machine_mode mode = TImode;
13595 int off, i, vr_start;
13596
13597 /* Set OFF to the offset from virtual_incoming_args_rtx of
13598 the first vector register. The VR save area lies below
13599 the GR one, and is aligned to 16 bytes. */
13600 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13601 STACK_BOUNDARY / BITS_PER_UNIT);
13602 off -= vr_saved * UNITS_PER_VREG;
13603
13604 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13605 for (i = 0; i < vr_saved; ++i)
13606 {
13607 rtx ptr, mem;
13608
13609 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13610 mem = gen_frame_mem (mode, ptr);
13611 set_mem_alias_set (mem, get_varargs_alias_set ());
13612 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13613 off += UNITS_PER_VREG;
13614 }
13615 }
13616 }
13617
13618 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13619 any complication of having crtl->args.pretend_args_size changed. */
13620 cfun->machine->frame.saved_varargs_size
13621 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13622 STACK_BOUNDARY / BITS_PER_UNIT)
13623 + vr_saved * UNITS_PER_VREG);
13624 }
13625
13626 static void
13627 aarch64_conditional_register_usage (void)
13628 {
13629 int i;
13630 if (!TARGET_FLOAT)
13631 {
13632 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13633 {
13634 fixed_regs[i] = 1;
13635 call_used_regs[i] = 1;
13636 }
13637 }
13638 if (!TARGET_SVE)
13639 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13640 {
13641 fixed_regs[i] = 1;
13642 call_used_regs[i] = 1;
13643 }
13644
13645 /* When tracking speculation, we need a couple of call-clobbered registers
13646 to track the speculation state. It would be nice to just use
13647 IP0 and IP1, but currently there are numerous places that just
13648 assume these registers are free for other uses (e.g. pointer
13649 authentication). */
13650 if (aarch64_track_speculation)
13651 {
13652 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13653 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13654 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13655 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13656 }
13657 }
13658
13659 /* Walk down the type tree of TYPE counting consecutive base elements.
13660 If *MODEP is VOIDmode, then set it to the first valid floating point
13661 type. If a non-floating point type is found, or if a floating point
13662 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13663 otherwise return the count in the sub-tree. */
13664 static int
13665 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13666 {
13667 machine_mode mode;
13668 HOST_WIDE_INT size;
13669
13670 switch (TREE_CODE (type))
13671 {
13672 case REAL_TYPE:
13673 mode = TYPE_MODE (type);
13674 if (mode != DFmode && mode != SFmode
13675 && mode != TFmode && mode != HFmode)
13676 return -1;
13677
13678 if (*modep == VOIDmode)
13679 *modep = mode;
13680
13681 if (*modep == mode)
13682 return 1;
13683
13684 break;
13685
13686 case COMPLEX_TYPE:
13687 mode = TYPE_MODE (TREE_TYPE (type));
13688 if (mode != DFmode && mode != SFmode
13689 && mode != TFmode && mode != HFmode)
13690 return -1;
13691
13692 if (*modep == VOIDmode)
13693 *modep = mode;
13694
13695 if (*modep == mode)
13696 return 2;
13697
13698 break;
13699
13700 case VECTOR_TYPE:
13701 /* Use V2SImode and V4SImode as representatives of all 64-bit
13702 and 128-bit vector types. */
13703 size = int_size_in_bytes (type);
13704 switch (size)
13705 {
13706 case 8:
13707 mode = V2SImode;
13708 break;
13709 case 16:
13710 mode = V4SImode;
13711 break;
13712 default:
13713 return -1;
13714 }
13715
13716 if (*modep == VOIDmode)
13717 *modep = mode;
13718
13719 /* Vector modes are considered to be opaque: two vectors are
13720 equivalent for the purposes of being homogeneous aggregates
13721 if they are the same size. */
13722 if (*modep == mode)
13723 return 1;
13724
13725 break;
13726
13727 case ARRAY_TYPE:
13728 {
13729 int count;
13730 tree index = TYPE_DOMAIN (type);
13731
13732 /* Can't handle incomplete types nor sizes that are not
13733 fixed. */
13734 if (!COMPLETE_TYPE_P (type)
13735 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13736 return -1;
13737
13738 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13739 if (count == -1
13740 || !index
13741 || !TYPE_MAX_VALUE (index)
13742 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13743 || !TYPE_MIN_VALUE (index)
13744 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13745 || count < 0)
13746 return -1;
13747
13748 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13749 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13750
13751 /* There must be no padding. */
13752 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13753 count * GET_MODE_BITSIZE (*modep)))
13754 return -1;
13755
13756 return count;
13757 }
13758
13759 case RECORD_TYPE:
13760 {
13761 int count = 0;
13762 int sub_count;
13763 tree field;
13764
13765 /* Can't handle incomplete types nor sizes that are not
13766 fixed. */
13767 if (!COMPLETE_TYPE_P (type)
13768 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13769 return -1;
13770
13771 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13772 {
13773 if (TREE_CODE (field) != FIELD_DECL)
13774 continue;
13775
13776 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13777 if (sub_count < 0)
13778 return -1;
13779 count += sub_count;
13780 }
13781
13782 /* There must be no padding. */
13783 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13784 count * GET_MODE_BITSIZE (*modep)))
13785 return -1;
13786
13787 return count;
13788 }
13789
13790 case UNION_TYPE:
13791 case QUAL_UNION_TYPE:
13792 {
13793 /* These aren't very interesting except in a degenerate case. */
13794 int count = 0;
13795 int sub_count;
13796 tree field;
13797
13798 /* Can't handle incomplete types nor sizes that are not
13799 fixed. */
13800 if (!COMPLETE_TYPE_P (type)
13801 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13802 return -1;
13803
13804 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13805 {
13806 if (TREE_CODE (field) != FIELD_DECL)
13807 continue;
13808
13809 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13810 if (sub_count < 0)
13811 return -1;
13812 count = count > sub_count ? count : sub_count;
13813 }
13814
13815 /* There must be no padding. */
13816 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13817 count * GET_MODE_BITSIZE (*modep)))
13818 return -1;
13819
13820 return count;
13821 }
13822
13823 default:
13824 break;
13825 }
13826
13827 return -1;
13828 }
13829
13830 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13831 type as described in AAPCS64 \S 4.1.2.
13832
13833 See the comment above aarch64_composite_type_p for the notes on MODE. */
13834
13835 static bool
13836 aarch64_short_vector_p (const_tree type,
13837 machine_mode mode)
13838 {
13839 poly_int64 size = -1;
13840
13841 if (type && TREE_CODE (type) == VECTOR_TYPE)
13842 size = int_size_in_bytes (type);
13843 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13844 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13845 size = GET_MODE_SIZE (mode);
13846
13847 return known_eq (size, 8) || known_eq (size, 16);
13848 }
13849
13850 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13851 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
13852 array types. The C99 floating-point complex types are also considered
13853 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
13854 types, which are GCC extensions and out of the scope of AAPCS64, are
13855 treated as composite types here as well.
13856
13857 Note that MODE itself is not sufficient in determining whether a type
13858 is such a composite type or not. This is because
13859 stor-layout.c:compute_record_mode may have already changed the MODE
13860 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
13861 structure with only one field may have its MODE set to the mode of the
13862 field. Also an integer mode whose size matches the size of the
13863 RECORD_TYPE type may be used to substitute the original mode
13864 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
13865 solely relied on. */
13866
13867 static bool
13868 aarch64_composite_type_p (const_tree type,
13869 machine_mode mode)
13870 {
13871 if (aarch64_short_vector_p (type, mode))
13872 return false;
13873
13874 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13875 return true;
13876
13877 if (mode == BLKmode
13878 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13879 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13880 return true;
13881
13882 return false;
13883 }
13884
13885 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13886 shall be passed or returned in simd/fp register(s) (providing these
13887 parameter passing registers are available).
13888
13889 Upon successful return, *COUNT returns the number of needed registers,
13890 *BASE_MODE returns the mode of the individual register and, when IS_HA
13891 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13892 floating-point aggregate or a homogeneous short-vector aggregate. */
13893
13894 static bool
13895 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13896 const_tree type,
13897 machine_mode *base_mode,
13898 int *count,
13899 bool *is_ha)
13900 {
13901 machine_mode new_mode = VOIDmode;
13902 bool composite_p = aarch64_composite_type_p (type, mode);
13903
13904 if (is_ha != NULL) *is_ha = false;
13905
13906 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13907 || aarch64_short_vector_p (type, mode))
13908 {
13909 *count = 1;
13910 new_mode = mode;
13911 }
13912 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13913 {
13914 if (is_ha != NULL) *is_ha = true;
13915 *count = 2;
13916 new_mode = GET_MODE_INNER (mode);
13917 }
13918 else if (type && composite_p)
13919 {
13920 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
13921
13922 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
13923 {
13924 if (is_ha != NULL) *is_ha = true;
13925 *count = ag_count;
13926 }
13927 else
13928 return false;
13929 }
13930 else
13931 return false;
13932
13933 *base_mode = new_mode;
13934 return true;
13935 }
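
/* Illustrative sketch, not part of the original sources: example types and
   how the candidate check above classifies them. The type names are
   hypothetical; int32x4_t comes from arm_neon.h. */

#include <arm_neon.h>

struct hfa3   { double x, y, z; };      /* HFA: three DFmode registers.     */
struct hva2   { int32x4_t a, b; };      /* HVA: two 128-bit vector regs.    */
/* _Complex double is treated as a two-element homogeneous aggregate.       */
struct mixed  { float f; int i; };      /* Mixed element types: rejected.   */
struct toobig { double d[5]; };         /* Five elements > HA_MAX_NUM_FLDS:
                                           rejected.                        */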
13936
13937 /* Implement TARGET_STRUCT_VALUE_RTX. */
13938
13939 static rtx
13940 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
13941 int incoming ATTRIBUTE_UNUSED)
13942 {
13943 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
13944 }
13945
13946 /* Implements target hook vector_mode_supported_p. */
13947 static bool
13948 aarch64_vector_mode_supported_p (machine_mode mode)
13949 {
13950 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13951 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
13952 }
13953
13954 /* Return appropriate SIMD container
13955 for MODE within a vector of WIDTH bits. */
13956 static machine_mode
13957 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
13958 {
13959 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
13960 switch (mode)
13961 {
13962 case E_DFmode:
13963 return VNx2DFmode;
13964 case E_SFmode:
13965 return VNx4SFmode;
13966 case E_HFmode:
13967 return VNx8HFmode;
13968 case E_DImode:
13969 return VNx2DImode;
13970 case E_SImode:
13971 return VNx4SImode;
13972 case E_HImode:
13973 return VNx8HImode;
13974 case E_QImode:
13975 return VNx16QImode;
13976 default:
13977 return word_mode;
13978 }
13979
13980 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
13981 if (TARGET_SIMD)
13982 {
13983 if (known_eq (width, 128))
13984 switch (mode)
13985 {
13986 case E_DFmode:
13987 return V2DFmode;
13988 case E_SFmode:
13989 return V4SFmode;
13990 case E_HFmode:
13991 return V8HFmode;
13992 case E_SImode:
13993 return V4SImode;
13994 case E_HImode:
13995 return V8HImode;
13996 case E_QImode:
13997 return V16QImode;
13998 case E_DImode:
13999 return V2DImode;
14000 default:
14001 break;
14002 }
14003 else
14004 switch (mode)
14005 {
14006 case E_SFmode:
14007 return V2SFmode;
14008 case E_HFmode:
14009 return V4HFmode;
14010 case E_SImode:
14011 return V2SImode;
14012 case E_HImode:
14013 return V4HImode;
14014 case E_QImode:
14015 return V8QImode;
14016 default:
14017 break;
14018 }
14019 }
14020 return word_mode;
14021 }
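
/* Illustrative sketch, not part of the original sources: GNU vector typedefs
   whose modes match the fixed-width containers returned above (the typedef
   names are hypothetical). With TARGET_SVE the function instead returns the
   variable-length VNx* modes, which have no fixed-size C-level equivalent. */

typedef float     v4sf __attribute__ ((vector_size (16)));  /* V4SFmode.  */
typedef float     v2sf __attribute__ ((vector_size (8)));   /* V2SFmode.  */
typedef short     v8hi __attribute__ ((vector_size (16)));  /* V8HImode.  */
typedef long long v2di __attribute__ ((vector_size (16)));  /* V2DImode.  */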
14022
14023 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14024 static machine_mode
14025 aarch64_preferred_simd_mode (scalar_mode mode)
14026 {
14027 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14028 return aarch64_simd_container_mode (mode, bits);
14029 }
14030
14031 /* Return a list of possible vector sizes for the vectorizer
14032 to iterate over. */
14033 static void
14034 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14035 {
14036 if (TARGET_SVE)
14037 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14038 sizes->safe_push (16);
14039 sizes->safe_push (8);
14040 }
14041
14042 /* Implement TARGET_MANGLE_TYPE. */
14043
14044 static const char *
14045 aarch64_mangle_type (const_tree type)
14046 {
14047 /* The AArch64 ABI documents say that "__va_list" has to be
14048 mangled as if it is in the "std" namespace. */
14049 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14050 return "St9__va_list";
14051
14052 /* Half-precision float. */
14053 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14054 return "Dh";
14055
14056 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14057 builtin types. */
14058 if (TYPE_NAME (type) != NULL)
14059 return aarch64_mangle_builtin_type (type);
14060
14061 /* Use the default mangling. */
14062 return NULL;
14063 }
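
/* Illustrative sketch, not part of the original sources: the effect of the
   special manglings above, assuming the declarations below are compiled as
   C++. Function names are hypothetical; results can be checked with
   c++filt on the object file. */

void f (__fp16 x);              /* __fp16 mangles as "Dh":  _Z1fDh.          */
void g (__builtin_va_list ap);  /* va_list mangles as "St9__va_list":
                                   _Z1gSt9__va_list.                         */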
14064
14065 /* Find the first rtx_insn before insn that will generate an assembly
14066 instruction. */
14067
14068 static rtx_insn *
14069 aarch64_prev_real_insn (rtx_insn *insn)
14070 {
14071 if (!insn)
14072 return NULL;
14073
14074 do
14075 {
14076 insn = prev_real_insn (insn);
14077 }
14078 while (insn && recog_memoized (insn) < 0);
14079
14080 return insn;
14081 }
14082
14083 static bool
14084 is_madd_op (enum attr_type t1)
14085 {
14086 unsigned int i;
14087 /* A number of these may be AArch32 only. */
14088 enum attr_type mlatypes[] = {
14089 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14090 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14091 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14092 };
14093
14094 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14095 {
14096 if (t1 == mlatypes[i])
14097 return true;
14098 }
14099
14100 return false;
14101 }
14102
14103 /* Check if there is a register dependency between a load and the insn
14104 for which we hold recog_data. */
14105
14106 static bool
14107 dep_between_memop_and_curr (rtx memop)
14108 {
14109 rtx load_reg;
14110 int opno;
14111
14112 gcc_assert (GET_CODE (memop) == SET);
14113
14114 if (!REG_P (SET_DEST (memop)))
14115 return false;
14116
14117 load_reg = SET_DEST (memop);
14118 for (opno = 1; opno < recog_data.n_operands; opno++)
14119 {
14120 rtx operand = recog_data.operand[opno];
14121 if (REG_P (operand)
14122 && reg_overlap_mentioned_p (load_reg, operand))
14123 return true;
14124
14125 }
14126 return false;
14127 }
14128
14129
14130 /* When working around the Cortex-A53 erratum 835769,
14131 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14132 instruction and has a preceding memory instruction such that a NOP
14133 should be inserted between them. */
14134
14135 bool
14136 aarch64_madd_needs_nop (rtx_insn* insn)
14137 {
14138 enum attr_type attr_type;
14139 rtx_insn *prev;
14140 rtx body;
14141
14142 if (!TARGET_FIX_ERR_A53_835769)
14143 return false;
14144
14145 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14146 return false;
14147
14148 attr_type = get_attr_type (insn);
14149 if (!is_madd_op (attr_type))
14150 return false;
14151
14152 prev = aarch64_prev_real_insn (insn);
14153 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14154 Restore recog state to INSN to avoid state corruption. */
14155 extract_constrain_insn_cached (insn);
14156
14157 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14158 return false;
14159
14160 body = single_set (prev);
14161
14162 /* If the previous insn is a memory op and there is no dependency between
14163 it and the DImode madd, emit a NOP between them. If body is NULL then we
14164 have a complex memory operation, probably a load/store pair.
14165 Be conservative for now and emit a NOP. */
14166 if (GET_MODE (recog_data.operand[0]) == DImode
14167 && (!body || !dep_between_memop_and_curr (body)))
14168 return true;
14169
14170 return false;
14171
14172 }
14173
14174
14175 /* Implement FINAL_PRESCAN_INSN. */
14176
14177 void
14178 aarch64_final_prescan_insn (rtx_insn *insn)
14179 {
14180 if (aarch64_madd_needs_nop (insn))
14181 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14182 }
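
/* Illustrative sketch, not part of the original sources: source that can
   trigger the workaround above. With -mfix-cortex-a53-835769, if the backend
   happens to schedule the load of *p immediately before the 64-bit
   multiply-accumulate, FINAL_PRESCAN_INSN prints a "nop" between the two
   instructions. The function name is hypothetical and the exact schedule
   depends on the optimisation level. */

static long
madd_after_load (long *p, long a, long b)
{
  long acc = *p;
  return acc + a * b;
}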
14183
14184
14185 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14186 instruction. */
14187
14188 bool
14189 aarch64_sve_index_immediate_p (rtx base_or_step)
14190 {
14191 return (CONST_INT_P (base_or_step)
14192 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14193 }
14194
14195 /* Return true if X is a valid immediate for the SVE ADD and SUB
14196 instructions. Negate X first if NEGATE_P is true. */
14197
14198 bool
14199 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14200 {
14201 rtx elt;
14202
14203 if (!const_vec_duplicate_p (x, &elt)
14204 || !CONST_INT_P (elt))
14205 return false;
14206
14207 HOST_WIDE_INT val = INTVAL (elt);
14208 if (negate_p)
14209 val = -val;
14210 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14211
14212 if (val & 0xff)
14213 return IN_RANGE (val, 0, 0xff);
14214 return IN_RANGE (val, 0, 0xff00);
14215 }
14216
14217 /* Return true if X is a valid immediate operand for an SVE logical
14218 instruction such as AND. */
14219
14220 bool
14221 aarch64_sve_bitmask_immediate_p (rtx x)
14222 {
14223 rtx elt;
14224
14225 return (const_vec_duplicate_p (x, &elt)
14226 && CONST_INT_P (elt)
14227 && aarch64_bitmask_imm (INTVAL (elt),
14228 GET_MODE_INNER (GET_MODE (x))));
14229 }
14230
14231 /* Return true if X is a valid immediate for the SVE DUP and CPY
14232 instructions. */
14233
14234 bool
14235 aarch64_sve_dup_immediate_p (rtx x)
14236 {
14237 rtx elt;
14238
14239 if (!const_vec_duplicate_p (x, &elt)
14240 || !CONST_INT_P (elt))
14241 return false;
14242
14243 HOST_WIDE_INT val = INTVAL (elt);
14244 if (val & 0xff)
14245 return IN_RANGE (val, -0x80, 0x7f);
14246 return IN_RANGE (val, -0x8000, 0x7f00);
14247 }
14248
14249 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14250 SIGNED_P says whether the operand is signed rather than unsigned. */
14251
14252 bool
14253 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14254 {
14255 rtx elt;
14256
14257 return (const_vec_duplicate_p (x, &elt)
14258 && CONST_INT_P (elt)
14259 && (signed_p
14260 ? IN_RANGE (INTVAL (elt), -16, 15)
14261 : IN_RANGE (INTVAL (elt), 0, 127)));
14262 }
14263
14264 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14265 instruction. Negate X first if NEGATE_P is true. */
14266
14267 bool
14268 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14269 {
14270 rtx elt;
14271 REAL_VALUE_TYPE r;
14272
14273 if (!const_vec_duplicate_p (x, &elt)
14274 || GET_CODE (elt) != CONST_DOUBLE)
14275 return false;
14276
14277 r = *CONST_DOUBLE_REAL_VALUE (elt);
14278
14279 if (negate_p)
14280 r = real_value_negate (&r);
14281
14282 if (real_equal (&r, &dconst1))
14283 return true;
14284 if (real_equal (&r, &dconsthalf))
14285 return true;
14286 return false;
14287 }
14288
14289 /* Return true if X is a valid immediate operand for an SVE FMUL
14290 instruction. */
14291
14292 bool
14293 aarch64_sve_float_mul_immediate_p (rtx x)
14294 {
14295 rtx elt;
14296
14297 /* GCC will never generate a multiply with an immediate of 2, so there is no
14298 point testing for it (even though it is a valid constant). */
14299 return (const_vec_duplicate_p (x, &elt)
14300 && GET_CODE (elt) == CONST_DOUBLE
14301 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14302 }
14303
14304 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14305 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14306 is nonnull, use it to describe valid immediates. */
14307 static bool
14308 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14309 simd_immediate_info *info,
14310 enum simd_immediate_check which,
14311 simd_immediate_info::insn_type insn)
14312 {
14313 /* Try a 4-byte immediate with LSL. */
14314 for (unsigned int shift = 0; shift < 32; shift += 8)
14315 if ((val32 & (0xff << shift)) == val32)
14316 {
14317 if (info)
14318 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14319 simd_immediate_info::LSL, shift);
14320 return true;
14321 }
14322
14323 /* Try a 2-byte immediate with LSL. */
14324 unsigned int imm16 = val32 & 0xffff;
14325 if (imm16 == (val32 >> 16))
14326 for (unsigned int shift = 0; shift < 16; shift += 8)
14327 if ((imm16 & (0xff << shift)) == imm16)
14328 {
14329 if (info)
14330 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14331 simd_immediate_info::LSL, shift);
14332 return true;
14333 }
14334
14335 /* Try a 4-byte immediate with MSL, except for cases that MVN
14336 can handle. */
14337 if (which == AARCH64_CHECK_MOV)
14338 for (unsigned int shift = 8; shift < 24; shift += 8)
14339 {
14340 unsigned int low = (1 << shift) - 1;
14341 if (((val32 & (0xff << shift)) | low) == val32)
14342 {
14343 if (info)
14344 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14345 simd_immediate_info::MSL, shift);
14346 return true;
14347 }
14348 }
14349
14350 return false;
14351 }
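
/* Worked examples for the cases above (illustrative only):
   - val32 == 0x00ab0000 matches the 4-byte LSL case with value 0xab and
     shift 16.
   - val32 == 0x4a004a00 repeats a 16-bit pattern and matches the 2-byte
     LSL case with value 0x4a and shift 8.
   - val32 == 0x0012ffff matches the MSL case (value 0x12, shift 16), since
     all bits below the shifted byte are ones.  */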
14352
14353 /* Return true if replicating VAL64 is a valid immediate for the
14354 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14355 use it to describe valid immediates. */
14356 static bool
14357 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14358 simd_immediate_info *info,
14359 enum simd_immediate_check which)
14360 {
14361 unsigned int val32 = val64 & 0xffffffff;
14362 unsigned int val16 = val64 & 0xffff;
14363 unsigned int val8 = val64 & 0xff;
14364
14365 if (val32 == (val64 >> 32))
14366 {
14367 if ((which & AARCH64_CHECK_ORR) != 0
14368 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14369 simd_immediate_info::MOV))
14370 return true;
14371
14372 if ((which & AARCH64_CHECK_BIC) != 0
14373 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14374 simd_immediate_info::MVN))
14375 return true;
14376
14377 /* Try using a replicated byte. */
14378 if (which == AARCH64_CHECK_MOV
14379 && val16 == (val32 >> 16)
14380 && val8 == (val16 >> 8))
14381 {
14382 if (info)
14383 *info = simd_immediate_info (QImode, val8);
14384 return true;
14385 }
14386 }
14387
14388 /* Try using a bit-to-bytemask. */
14389 if (which == AARCH64_CHECK_MOV)
14390 {
14391 unsigned int i;
14392 for (i = 0; i < 64; i += 8)
14393 {
14394 unsigned char byte = (val64 >> i) & 0xff;
14395 if (byte != 0 && byte != 0xff)
14396 break;
14397 }
14398 if (i == 64)
14399 {
14400 if (info)
14401 *info = simd_immediate_info (DImode, val64);
14402 return true;
14403 }
14404 }
14405 return false;
14406 }
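
/* Illustrative examples for the checks above (under AARCH64_CHECK_MOV):
   0x2929292929292929 is handled by the replicated-byte case (QImode value
   0x29), while a value such as 0xff0000ff00ffff00, in which every byte is
   either 0x00 or 0xff, is handled by the bit-to-bytemask case as a DImode
   immediate.  */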
14407
14408 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14409 instruction. If INFO is nonnull, use it to describe valid immediates. */
14410
14411 static bool
14412 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14413 simd_immediate_info *info)
14414 {
14415 scalar_int_mode mode = DImode;
14416 unsigned int val32 = val64 & 0xffffffff;
14417 if (val32 == (val64 >> 32))
14418 {
14419 mode = SImode;
14420 unsigned int val16 = val32 & 0xffff;
14421 if (val16 == (val32 >> 16))
14422 {
14423 mode = HImode;
14424 unsigned int val8 = val16 & 0xff;
14425 if (val8 == (val16 >> 8))
14426 mode = QImode;
14427 }
14428 }
14429 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14430 if (IN_RANGE (val, -0x80, 0x7f))
14431 {
14432 /* DUP with no shift. */
14433 if (info)
14434 *info = simd_immediate_info (mode, val);
14435 return true;
14436 }
14437 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14438 {
14439 /* DUP with LSL #8. */
14440 if (info)
14441 *info = simd_immediate_info (mode, val);
14442 return true;
14443 }
14444 if (aarch64_bitmask_imm (val64, mode))
14445 {
14446 /* DUPM. */
14447 if (info)
14448 *info = simd_immediate_info (mode, val);
14449 return true;
14450 }
14451 return false;
14452 }
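
/* Illustrative walk-through of the narrowing above: 0x0101010101010101
   narrows all the way to QImode with value 1 and is handled as a plain DUP;
   0xff00ff00ff00ff00 narrows to HImode with value -256 and is handled by
   the DUP-with-LSL #8 case; 0x00ff00ff00ff00ff narrows to HImode with value
   255, fails both DUP ranges, and should instead be accepted by the DUPM
   (bitmask immediate) case.  */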
14453
14454 /* Return true if OP is a valid SIMD immediate for the operation
14455 described by WHICH. If INFO is nonnull, use it to describe valid
14456 immediates. */
14457 bool
14458 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14459 enum simd_immediate_check which)
14460 {
14461 machine_mode mode = GET_MODE (op);
14462 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14463 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14464 return false;
14465
14466 scalar_mode elt_mode = GET_MODE_INNER (mode);
14467 rtx base, step;
14468 unsigned int n_elts;
14469 if (GET_CODE (op) == CONST_VECTOR
14470 && CONST_VECTOR_DUPLICATE_P (op))
14471 n_elts = CONST_VECTOR_NPATTERNS (op);
14472 else if ((vec_flags & VEC_SVE_DATA)
14473 && const_vec_series_p (op, &base, &step))
14474 {
14475 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14476 if (!aarch64_sve_index_immediate_p (base)
14477 || !aarch64_sve_index_immediate_p (step))
14478 return false;
14479
14480 if (info)
14481 *info = simd_immediate_info (elt_mode, base, step);
14482 return true;
14483 }
14484 else if (GET_CODE (op) == CONST_VECTOR
14485 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14486 /* N_ELTS set above. */;
14487 else
14488 return false;
14489
14490 /* Handle PFALSE and PTRUE. */
14491 if (vec_flags & VEC_SVE_PRED)
14492 return (op == CONST0_RTX (mode)
14493 || op == CONSTM1_RTX (mode));
14494
14495 scalar_float_mode elt_float_mode;
14496 if (n_elts == 1
14497 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14498 {
14499 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14500 if (aarch64_float_const_zero_rtx_p (elt)
14501 || aarch64_float_const_representable_p (elt))
14502 {
14503 if (info)
14504 *info = simd_immediate_info (elt_float_mode, elt);
14505 return true;
14506 }
14507 }
14508
14509 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14510 if (elt_size > 8)
14511 return false;
14512
14513 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14514
14515 /* Expand the vector constant out into a byte vector, with the least
14516 significant byte of the register first. */
14517 auto_vec<unsigned char, 16> bytes;
14518 bytes.reserve (n_elts * elt_size);
14519 for (unsigned int i = 0; i < n_elts; i++)
14520 {
14521 /* The vector is provided in gcc endian-neutral fashion.
14522 For aarch64_be Advanced SIMD, it must be laid out in the vector
14523 register in reverse order. */
14524 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14525 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14526
14527 if (elt_mode != elt_int_mode)
14528 elt = gen_lowpart (elt_int_mode, elt);
14529
14530 if (!CONST_INT_P (elt))
14531 return false;
14532
14533 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14534 for (unsigned int byte = 0; byte < elt_size; byte++)
14535 {
14536 bytes.quick_push (elt_val & 0xff);
14537 elt_val >>= BITS_PER_UNIT;
14538 }
14539 }
14540
14541 /* The immediate must repeat every eight bytes. */
14542 unsigned int nbytes = bytes.length ();
14543 for (unsigned i = 8; i < nbytes; ++i)
14544 if (bytes[i] != bytes[i - 8])
14545 return false;
14546
14547 /* Get the repeating 8-byte value as an integer. No endian correction
14548 is needed here because bytes is already in lsb-first order. */
14549 unsigned HOST_WIDE_INT val64 = 0;
14550 for (unsigned int i = 0; i < 8; i++)
14551 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14552 << (i * BITS_PER_UNIT));
14553
14554 if (vec_flags & VEC_SVE_DATA)
14555 return aarch64_sve_valid_immediate (val64, info);
14556 else
14557 return aarch64_advsimd_valid_immediate (val64, info, which);
14558 }
14559
14560 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14561 has a step that is a valid immediate for an SVE INDEX instruction.
14562 Return the step if so, otherwise return null. */
14563 rtx
14564 aarch64_check_zero_based_sve_index_immediate (rtx x)
14565 {
14566 rtx base, step;
14567 if (const_vec_series_p (x, &base, &step)
14568 && base == const0_rtx
14569 && aarch64_sve_index_immediate_p (step))
14570 return step;
14571 return NULL_RTX;
14572 }
14573
14574 /* Check if immediate shift constants are within range. */
14575 bool
14576 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14577 {
14578 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14579 if (left)
14580 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14581 else
14582 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14583 }
14584
14585 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14586 operation of width WIDTH at bit position POS. */
14587
14588 rtx
14589 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14590 {
14591 gcc_assert (CONST_INT_P (width));
14592 gcc_assert (CONST_INT_P (pos));
14593
14594 unsigned HOST_WIDE_INT mask
14595 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14596 return GEN_INT (mask << UINTVAL (pos));
14597 }
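
/* For example, WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16,
   i.e. the mask 0xff0000.  */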
14598
14599 bool
14600 aarch64_mov_operand_p (rtx x, machine_mode mode)
14601 {
14602 if (GET_CODE (x) == HIGH
14603 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14604 return true;
14605
14606 if (CONST_INT_P (x))
14607 return true;
14608
14609 if (VECTOR_MODE_P (GET_MODE (x)))
14610 return aarch64_simd_valid_immediate (x, NULL);
14611
14612 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14613 return true;
14614
14615 if (aarch64_sve_cnt_immediate_p (x))
14616 return true;
14617
14618 return aarch64_classify_symbolic_expression (x)
14619 == SYMBOL_TINY_ABSOLUTE;
14620 }
14621
14622 /* Return a const_int vector of VAL. */
14623 rtx
14624 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14625 {
14626 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14627 return gen_const_vec_duplicate (mode, c);
14628 }
14629
14630 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14631
14632 bool
14633 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14634 {
14635 machine_mode vmode;
14636
14637 vmode = aarch64_simd_container_mode (mode, 64);
14638 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14639 return aarch64_simd_valid_immediate (op_v, NULL);
14640 }
14641
14642 /* Construct and return a PARALLEL RTX vector with elements numbering the
14643 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14644 the vector - from the perspective of the architecture. This does not
14645 line up with GCC's perspective on lane numbers, so we end up with
14646 different masks depending on our target endian-ness. The diagram
14647 below may help. We must draw the distinction when building masks
14648 which select one half of the vector. An instruction selecting
14649 architectural low-lanes for a big-endian target must be described using
14650 a mask selecting GCC high-lanes.
14651
14652 Big-Endian Little-Endian
14653
14654 GCC 0 1 2 3 3 2 1 0
14655 | x | x | x | x | | x | x | x | x |
14656 Architecture 3 2 1 0 3 2 1 0
14657
14658 Low Mask: { 2, 3 } { 0, 1 }
14659 High Mask: { 0, 1 } { 2, 3 }
14660
14661 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14662
14663 rtx
14664 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14665 {
14666 rtvec v = rtvec_alloc (nunits / 2);
14667 int high_base = nunits / 2;
14668 int low_base = 0;
14669 int base;
14670 rtx t1;
14671 int i;
14672
14673 if (BYTES_BIG_ENDIAN)
14674 base = high ? low_base : high_base;
14675 else
14676 base = high ? high_base : low_base;
14677
14678 for (i = 0; i < nunits / 2; i++)
14679 RTVEC_ELT (v, i) = GEN_INT (base + i);
14680
14681 t1 = gen_rtx_PARALLEL (mode, v);
14682 return t1;
14683 }
14684
14685 /* Check OP for validity as a PARALLEL RTX vector with elements
14686 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14687 from the perspective of the architecture. See the diagram above
14688 aarch64_simd_vect_par_cnst_half for more details. */
14689
14690 bool
14691 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14692 bool high)
14693 {
14694 int nelts;
14695 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14696 return false;
14697
14698 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14699 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14700 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14701 int i = 0;
14702
14703 if (count_op != count_ideal)
14704 return false;
14705
14706 for (i = 0; i < count_ideal; i++)
14707 {
14708 rtx elt_op = XVECEXP (op, 0, i);
14709 rtx elt_ideal = XVECEXP (ideal, 0, i);
14710
14711 if (!CONST_INT_P (elt_op)
14712 || INTVAL (elt_ideal) != INTVAL (elt_op))
14713 return false;
14714 }
14715 return true;
14716 }
14717
14718 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14719 HIGH (exclusive). */
14720 void
14721 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14722 const_tree exp)
14723 {
14724 HOST_WIDE_INT lane;
14725 gcc_assert (CONST_INT_P (operand));
14726 lane = INTVAL (operand);
14727
14728 if (lane < low || lane >= high)
14729 {
14730 if (exp)
14731 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14732 else
14733 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14734 }
14735 }
14736
14737 /* Perform endian correction on lane number N, which indexes a vector
14738 of mode MODE, and return the result as an SImode rtx. */
14739
14740 rtx
14741 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14742 {
14743 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14744 }
14745
14746 /* Return TRUE if OP is a MEM with a valid vector addressing mode. */
14747
14748 bool
14749 aarch64_simd_mem_operand_p (rtx op)
14750 {
14751 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14752 || REG_P (XEXP (op, 0)));
14753 }
14754
14755 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14756
14757 bool
14758 aarch64_sve_ld1r_operand_p (rtx op)
14759 {
14760 struct aarch64_address_info addr;
14761 scalar_mode mode;
14762
14763 return (MEM_P (op)
14764 && is_a <scalar_mode> (GET_MODE (op), &mode)
14765 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14766 && addr.type == ADDRESS_REG_IMM
14767 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14768 }
14769
14770 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14771 The conditions for STR are the same. */
14772 bool
14773 aarch64_sve_ldr_operand_p (rtx op)
14774 {
14775 struct aarch64_address_info addr;
14776
14777 return (MEM_P (op)
14778 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14779 false, ADDR_QUERY_ANY)
14780 && addr.type == ADDRESS_REG_IMM);
14781 }
14782
14783 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14784 We need to be able to access the individual pieces, so the range
14785 is different from LD[234] and ST[234]. */
14786 bool
14787 aarch64_sve_struct_memory_operand_p (rtx op)
14788 {
14789 if (!MEM_P (op))
14790 return false;
14791
14792 machine_mode mode = GET_MODE (op);
14793 struct aarch64_address_info addr;
14794 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14795 ADDR_QUERY_ANY)
14796 || addr.type != ADDRESS_REG_IMM)
14797 return false;
14798
14799 poly_int64 first = addr.const_offset;
14800 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14801 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14802 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14803 }
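
/* As an illustration of the range check above (assuming
   offset_4bit_signed_scaled_p accepts offsets of -8 to +7 vectors): for a
   2-vector structure mode, both the first and the last vector must be
   addressable, so the starting offset is limited to -8 to +6 vectors rather
   than the full -8 to +7.  */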
14804
14805 /* Emit a register copy from operand to operand, taking care not to
14806 early-clobber source registers in the process.
14807
14808 COUNT is the number of components into which the copy needs to be
14809 decomposed. */
14810 void
14811 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14812 unsigned int count)
14813 {
14814 unsigned int i;
14815 int rdest = REGNO (operands[0]);
14816 int rsrc = REGNO (operands[1]);
14817
14818 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14819 || rdest < rsrc)
14820 for (i = 0; i < count; i++)
14821 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14822 gen_rtx_REG (mode, rsrc + i));
14823 else
14824 for (i = 0; i < count; i++)
14825 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14826 gen_rtx_REG (mode, rsrc + count - i - 1));
14827 }
14828
14829 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14830 one of the VSTRUCT modes: OI, CI, or XI. */
14831 int
14832 aarch64_simd_attr_length_rglist (machine_mode mode)
14833 {
14834 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14835 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14836 }
14837
14838 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14839 alignment of a vector to 128 bits. SVE predicates have an alignment of
14840 16 bits. */
14841 static HOST_WIDE_INT
14842 aarch64_simd_vector_alignment (const_tree type)
14843 {
14844 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14845 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14846 be set for non-predicate vectors of booleans. Modes are the most
14847 direct way we have of identifying real SVE predicate types. */
14848 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14849 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
14850 return MIN (align, 128);
14851 }
14852
14853 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
14854 static poly_uint64
14855 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14856 {
14857 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14858 {
14859 /* If the length of the vector is fixed, try to align to that length,
14860 otherwise don't try to align at all. */
14861 HOST_WIDE_INT result;
14862 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14863 result = TYPE_ALIGN (TREE_TYPE (type));
14864 return result;
14865 }
14866 return TYPE_ALIGN (type);
14867 }
14868
14869 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
14870 static bool
14871 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14872 {
14873 if (is_packed)
14874 return false;
14875
14876 /* For fixed-length vectors, check that the vectorizer will aim for
14877 full-vector alignment. This isn't true for generic GCC vectors
14878 that are wider than the ABI maximum of 128 bits. */
14879 poly_uint64 preferred_alignment =
14880 aarch64_vectorize_preferred_vector_alignment (type);
14881 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14882 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14883 preferred_alignment))
14884 return false;
14885
14886 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
14887 return true;
14888 }
14889
14890 /* Return true if the vector misalignment factor is supported by the
14891 target. */
14892 static bool
14893 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14894 const_tree type, int misalignment,
14895 bool is_packed)
14896 {
14897 if (TARGET_SIMD && STRICT_ALIGNMENT)
14898 {
14899 /* Return false if the movmisalign pattern is not supported for this mode. */
14900 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14901 return false;
14902
14903 /* Misalignment factor is unknown at compile time. */
14904 if (misalignment == -1)
14905 return false;
14906 }
14907 return default_builtin_support_vector_misalignment (mode, type, misalignment,
14908 is_packed);
14909 }
14910
14911 /* If VALS is a vector constant that can be loaded into a register
14912 using DUP, generate instructions to do so and return an RTX to
14913 assign to the register. Otherwise return NULL_RTX. */
14914 static rtx
14915 aarch64_simd_dup_constant (rtx vals)
14916 {
14917 machine_mode mode = GET_MODE (vals);
14918 machine_mode inner_mode = GET_MODE_INNER (mode);
14919 rtx x;
14920
14921 if (!const_vec_duplicate_p (vals, &x))
14922 return NULL_RTX;
14923
14924 /* We can load this constant by using DUP and a constant in a
14925 single ARM register. This will be cheaper than a vector
14926 load. */
14927 x = copy_to_mode_reg (inner_mode, x);
14928 return gen_vec_duplicate (mode, x);
14929 }
14930
14931
14932 /* Generate code to load VALS, which is a PARALLEL containing only
14933 constants (for vec_init) or CONST_VECTOR, efficiently into a
14934 register. Returns an RTX to copy into the register, or NULL_RTX
14935 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
14936 static rtx
14937 aarch64_simd_make_constant (rtx vals)
14938 {
14939 machine_mode mode = GET_MODE (vals);
14940 rtx const_dup;
14941 rtx const_vec = NULL_RTX;
14942 int n_const = 0;
14943 int i;
14944
14945 if (GET_CODE (vals) == CONST_VECTOR)
14946 const_vec = vals;
14947 else if (GET_CODE (vals) == PARALLEL)
14948 {
14949 /* A CONST_VECTOR must contain only CONST_INTs and
14950 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
14951 Only store valid constants in a CONST_VECTOR. */
14952 int n_elts = XVECLEN (vals, 0);
14953 for (i = 0; i < n_elts; ++i)
14954 {
14955 rtx x = XVECEXP (vals, 0, i);
14956 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14957 n_const++;
14958 }
14959 if (n_const == n_elts)
14960 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
14961 }
14962 else
14963 gcc_unreachable ();
14964
14965 if (const_vec != NULL_RTX
14966 && aarch64_simd_valid_immediate (const_vec, NULL))
14967 /* Load using MOVI/MVNI. */
14968 return const_vec;
14969 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
14970 /* Loaded using DUP. */
14971 return const_dup;
14972 else if (const_vec != NULL_RTX)
14973 /* Load from constant pool. We cannot take advantage of single-cycle
14974 LD1 because we need a PC-relative addressing mode. */
14975 return const_vec;
14976 else
14977 /* A PARALLEL containing something not valid inside CONST_VECTOR.
14978 We cannot construct an initializer. */
14979 return NULL_RTX;
14980 }
14981
14982 /* Expand a vector initialisation sequence, such that TARGET is
14983 initialised to contain VALS. */
14984
14985 void
14986 aarch64_expand_vector_init (rtx target, rtx vals)
14987 {
14988 machine_mode mode = GET_MODE (target);
14989 scalar_mode inner_mode = GET_MODE_INNER (mode);
14990 /* The number of vector elements. */
14991 int n_elts = XVECLEN (vals, 0);
14992 /* The number of vector elements which are not constant. */
14993 int n_var = 0;
14994 rtx any_const = NULL_RTX;
14995 /* The first element of vals. */
14996 rtx v0 = XVECEXP (vals, 0, 0);
14997 bool all_same = true;
14998
14999 /* Count the number of variable elements to initialise. */
15000 for (int i = 0; i < n_elts; ++i)
15001 {
15002 rtx x = XVECEXP (vals, 0, i);
15003 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15004 ++n_var;
15005 else
15006 any_const = x;
15007
15008 all_same &= rtx_equal_p (x, v0);
15009 }
15010
15011 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15012 how best to handle this. */
15013 if (n_var == 0)
15014 {
15015 rtx constant = aarch64_simd_make_constant (vals);
15016 if (constant != NULL_RTX)
15017 {
15018 emit_move_insn (target, constant);
15019 return;
15020 }
15021 }
15022
15023 /* Splat a single non-constant element if we can. */
15024 if (all_same)
15025 {
15026 rtx x = copy_to_mode_reg (inner_mode, v0);
15027 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15028 return;
15029 }
15030
15031 enum insn_code icode = optab_handler (vec_set_optab, mode);
15032 gcc_assert (icode != CODE_FOR_nothing);
15033
15034 /* If there are only variable elements, try to optimize
15035 the insertion using dup for the most common element
15036 followed by insertions. */
15037
15038 /* The algorithm will fill matches[*][0] with the earliest matching element,
15039 and matches[X][1] with the count of duplicate elements (if X is the
15040 earliest element which has duplicates). */
15041
15042 if (n_var == n_elts && n_elts <= 16)
15043 {
15044 int matches[16][2] = {0};
15045 for (int i = 0; i < n_elts; i++)
15046 {
15047 for (int j = 0; j <= i; j++)
15048 {
15049 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15050 {
15051 matches[i][0] = j;
15052 matches[j][1]++;
15053 break;
15054 }
15055 }
15056 }
15057 int maxelement = 0;
15058 int maxv = 0;
15059 for (int i = 0; i < n_elts; i++)
15060 if (matches[i][1] > maxv)
15061 {
15062 maxelement = i;
15063 maxv = matches[i][1];
15064 }
15065
15066 /* Create a duplicate of the most common element, unless all elements
15067 are equally useless to us, in which case just immediately set the
15068 vector register using the first element. */
15069
15070 if (maxv == 1)
15071 {
15072 /* For vectors of two 64-bit elements, we can do even better. */
15073 if (n_elts == 2
15074 && (inner_mode == E_DImode
15075 || inner_mode == E_DFmode))
15077 {
15078 rtx x0 = XVECEXP (vals, 0, 0);
15079 rtx x1 = XVECEXP (vals, 0, 1);
15080 /* Combine can pick up this case, but handling it directly
15081 here leaves clearer RTL.
15082
15083 This is load_pair_lanes<mode>, and also gives us a clean-up
15084 for store_pair_lanes<mode>. */
15085 if (memory_operand (x0, inner_mode)
15086 && memory_operand (x1, inner_mode)
15087 && !STRICT_ALIGNMENT
15088 && rtx_equal_p (XEXP (x1, 0),
15089 plus_constant (Pmode,
15090 XEXP (x0, 0),
15091 GET_MODE_SIZE (inner_mode))))
15092 {
15093 rtx t;
15094 if (inner_mode == DFmode)
15095 t = gen_load_pair_lanesdf (target, x0, x1);
15096 else
15097 t = gen_load_pair_lanesdi (target, x0, x1);
15098 emit_insn (t);
15099 return;
15100 }
15101 }
15102 /* The subreg-move sequence below will move into lane zero of the
15103 vector register. For big-endian we want that position to hold
15104 the last element of VALS. */
15105 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15106 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15107 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15108 }
15109 else
15110 {
15111 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15112 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15113 }
15114
15115 /* Insert the rest. */
15116 for (int i = 0; i < n_elts; i++)
15117 {
15118 rtx x = XVECEXP (vals, 0, i);
15119 if (matches[i][0] == maxelement)
15120 continue;
15121 x = copy_to_mode_reg (inner_mode, x);
15122 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15123 }
15124 return;
15125 }
15126
15127 /* Initialise a vector which is part-variable. We want to first try
15128 to build those lanes which are constant in the most efficient way we
15129 can. */
15130 if (n_var != n_elts)
15131 {
15132 rtx copy = copy_rtx (vals);
15133
15134 /* Load constant part of vector. We really don't care what goes into the
15135 parts we will overwrite, but we're more likely to be able to load the
15136 constant efficiently if it has fewer, larger, repeating parts
15137 (see aarch64_simd_valid_immediate). */
15138 for (int i = 0; i < n_elts; i++)
15139 {
15140 rtx x = XVECEXP (vals, 0, i);
15141 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15142 continue;
15143 rtx subst = any_const;
15144 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15145 {
15146 /* Look in the copied vector, as more elements are const. */
15147 rtx test = XVECEXP (copy, 0, i ^ bit);
15148 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15149 {
15150 subst = test;
15151 break;
15152 }
15153 }
15154 XVECEXP (copy, 0, i) = subst;
15155 }
15156 aarch64_expand_vector_init (target, copy);
15157 }
15158
15159 /* Insert the variable lanes directly. */
15160 for (int i = 0; i < n_elts; i++)
15161 {
15162 rtx x = XVECEXP (vals, 0, i);
15163 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15164 continue;
15165 x = copy_to_mode_reg (inner_mode, x);
15166 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15167 }
15168 }
15169
15170 static unsigned HOST_WIDE_INT
15171 aarch64_shift_truncation_mask (machine_mode mode)
15172 {
15173 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15174 return 0;
15175 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15176 }
15177
15178 /* Select a format to encode pointers in exception handling data. */
15179 int
15180 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15181 {
15182 int type;
15183 switch (aarch64_cmodel)
15184 {
15185 case AARCH64_CMODEL_TINY:
15186 case AARCH64_CMODEL_TINY_PIC:
15187 case AARCH64_CMODEL_SMALL:
15188 case AARCH64_CMODEL_SMALL_PIC:
15189 case AARCH64_CMODEL_SMALL_SPIC:
15190 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15191 for everything. */
15192 type = DW_EH_PE_sdata4;
15193 break;
15194 default:
15195 /* No assumptions here. 8-byte relocs required. */
15196 type = DW_EH_PE_sdata8;
15197 break;
15198 }
15199 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15200 }
15201
15202 /* The last .arch and .tune assembly strings that we printed. */
15203 static std::string aarch64_last_printed_arch_string;
15204 static std::string aarch64_last_printed_tune_string;
15205
15206 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15207 by the function fndecl. */
15208
15209 void
15210 aarch64_declare_function_name (FILE *stream, const char* name,
15211 tree fndecl)
15212 {
15213 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15214
15215 struct cl_target_option *targ_options;
15216 if (target_parts)
15217 targ_options = TREE_TARGET_OPTION (target_parts);
15218 else
15219 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15220 gcc_assert (targ_options);
15221
15222 const struct processor *this_arch
15223 = aarch64_get_arch (targ_options->x_explicit_arch);
15224
15225 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15226 std::string extension
15227 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15228 this_arch->flags);
15229 /* Only update the assembler .arch string if it is distinct from the last
15230 such string we printed. */
15231 std::string to_print = this_arch->name + extension;
15232 if (to_print != aarch64_last_printed_arch_string)
15233 {
15234 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15235 aarch64_last_printed_arch_string = to_print;
15236 }
15237
15238 /* Print the cpu name we're tuning for in the comments; it might be
15239 useful to readers of the generated asm. Do it only when it changes
15240 from function to function and verbose assembly is requested. */
15241 const struct processor *this_tune
15242 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15243
15244 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15245 {
15246 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15247 this_tune->name);
15248 aarch64_last_printed_tune_string = this_tune->name;
15249 }
15250
15251 /* Don't forget the type directive for ELF. */
15252 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15253 ASM_OUTPUT_LABEL (stream, name);
15254 }
15255
15256 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15257
15258 static void
15259 aarch64_start_file (void)
15260 {
15261 struct cl_target_option *default_options
15262 = TREE_TARGET_OPTION (target_option_default_node);
15263
15264 const struct processor *default_arch
15265 = aarch64_get_arch (default_options->x_explicit_arch);
15266 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15267 std::string extension
15268 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15269 default_arch->flags);
15270
15271 aarch64_last_printed_arch_string = default_arch->name + extension;
15272 aarch64_last_printed_tune_string = "";
15273 asm_fprintf (asm_out_file, "\t.arch %s\n",
15274 aarch64_last_printed_arch_string.c_str ());
15275
15276 default_file_start ();
15277 }
15278
15279 /* Emit load exclusive. */
15280
15281 static void
15282 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15283 rtx mem, rtx model_rtx)
15284 {
15285 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15286 }
15287
15288 /* Emit store exclusive. */
15289
15290 static void
15291 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15292 rtx rval, rtx mem, rtx model_rtx)
15293 {
15294 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15295 }
15296
15297 /* Emit INSN as a jump and mark it as unlikely to be taken. */
15298
15299 static void
15300 aarch64_emit_unlikely_jump (rtx insn)
15301 {
15302 rtx_insn *jump = emit_jump_insn (insn);
15303 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15304 }
15305
15306 /* Expand a compare and swap pattern. */
15307
15308 void
15309 aarch64_expand_compare_and_swap (rtx operands[])
15310 {
15311 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15312 machine_mode mode, r_mode;
15313
15314 bval = operands[0];
15315 rval = operands[1];
15316 mem = operands[2];
15317 oldval = operands[3];
15318 newval = operands[4];
15319 is_weak = operands[5];
15320 mod_s = operands[6];
15321 mod_f = operands[7];
15322 mode = GET_MODE (mem);
15323
15324 /* Normally the succ memory model must be stronger than fail, but in the
15325 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15326 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15327 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15328 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15329 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15330
15331 r_mode = mode;
15332 if (mode == QImode || mode == HImode)
15333 {
15334 r_mode = SImode;
15335 rval = gen_reg_rtx (r_mode);
15336 }
15337
15338 if (TARGET_LSE)
15339 {
15340 /* The CAS insn requires oldval and rval overlap, but we need to
15341 have a copy of oldval saved across the operation to tell if
15342 the operation is successful. */
15343 if (reg_overlap_mentioned_p (rval, oldval))
15344 rval = copy_to_mode_reg (r_mode, oldval);
15345 else
15346 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15347
15348 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15349 newval, mod_s));
15350 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15351 }
15352 else
15353 {
15354 /* The oldval predicate varies by mode. Test it and force to reg. */
15355 insn_code code = code_for_aarch64_compare_and_swap (mode);
15356 if (!insn_data[code].operand[2].predicate (oldval, mode))
15357 oldval = force_reg (mode, oldval);
15358
15359 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15360 is_weak, mod_s, mod_f));
15361 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15362 }
15363
15364 if (r_mode != mode)
15365 rval = gen_lowpart (mode, rval);
15366 emit_move_insn (operands[1], rval);
15367
15368 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15369 emit_insn (gen_rtx_SET (bval, x));
15370 }
15371
15372 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15373 sequence implementing an atomic operation. */
15374
15375 static void
15376 aarch64_emit_post_barrier (enum memmodel model)
15377 {
15378 const enum memmodel base_model = memmodel_base (model);
15379
15380 if (is_mm_sync (model)
15381 && (base_model == MEMMODEL_ACQUIRE
15382 || base_model == MEMMODEL_ACQ_REL
15383 || base_model == MEMMODEL_SEQ_CST))
15384 {
15385 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15386 }
15387 }
15388
15389 /* Split a compare and swap pattern. */
15390
15391 void
15392 aarch64_split_compare_and_swap (rtx operands[])
15393 {
15394 rtx rval, mem, oldval, newval, scratch;
15395 machine_mode mode;
15396 bool is_weak;
15397 rtx_code_label *label1, *label2;
15398 rtx x, cond;
15399 enum memmodel model;
15400 rtx model_rtx;
15401
15402 rval = operands[0];
15403 mem = operands[1];
15404 oldval = operands[2];
15405 newval = operands[3];
15406 is_weak = (operands[4] != const0_rtx);
15407 model_rtx = operands[5];
15408 scratch = operands[7];
15409 mode = GET_MODE (mem);
15410 model = memmodel_from_int (INTVAL (model_rtx));
15411
15412 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15413 loop:
15414 .label1:
15415 LD[A]XR rval, [mem]
15416 CBNZ rval, .label2
15417 ST[L]XR scratch, newval, [mem]
15418 CBNZ scratch, .label1
15419 .label2:
15420 CMP rval, 0. */
15421 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15422
15423 label1 = NULL;
15424 if (!is_weak)
15425 {
15426 label1 = gen_label_rtx ();
15427 emit_label (label1);
15428 }
15429 label2 = gen_label_rtx ();
15430
15431 /* The initial load can be relaxed for a __sync operation since a final
15432 barrier will be emitted to stop code hoisting. */
15433 if (is_mm_sync (model))
15434 aarch64_emit_load_exclusive (mode, rval, mem,
15435 GEN_INT (MEMMODEL_RELAXED));
15436 else
15437 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15438
15439 if (strong_zero_p)
15440 {
15441 if (aarch64_track_speculation)
15442 {
15443 /* Emit an explicit compare instruction, so that we can correctly
15444 track the condition codes. */
15445 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15446 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15447 }
15448 else
15449 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15450
15451 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15452 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15453 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15454 }
15455 else
15456 {
15457 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15458 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15459 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15460 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15461 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15462 }
15463
15464 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15465
15466 if (!is_weak)
15467 {
15468 if (aarch64_track_speculation)
15469 {
15470 /* Emit an explicit compare instruction, so that we can correctly
15471 track the condition codes. */
15472 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15473 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15474 }
15475 else
15476 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15477
15478 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15479 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15480 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15481 }
15482 else
15483 {
15484 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15485 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15486 emit_insn (gen_rtx_SET (cond, x));
15487 }
15488
15489 emit_label (label2);
15490 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15491 to set the condition flags. If this is not used it will be removed by
15492 later passes. */
15493 if (strong_zero_p)
15494 {
15495 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15496 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15497 emit_insn (gen_rtx_SET (cond, x));
15498 }
15499 /* Emit any final barrier needed for a __sync operation. */
15500 if (is_mm_sync (model))
15501 aarch64_emit_post_barrier (model);
15502 }
15503
15504 /* Split an atomic operation. */
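/* In outline, the split sequence is a load-exclusive/store-exclusive retry
   loop (the register names below are purely illustrative; a final barrier
   may also be emitted for __sync operations):
     .label:
	LD[A]XR	old, [mem]
	<op>	new, old, value
	ST[L]XR	status, new, [mem]
	CBNZ	status, .label  */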
15505
15506 void
15507 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15508 rtx value, rtx model_rtx, rtx cond)
15509 {
15510 machine_mode mode = GET_MODE (mem);
15511 machine_mode wmode = (mode == DImode ? DImode : SImode);
15512 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15513 const bool is_sync = is_mm_sync (model);
15514 rtx_code_label *label;
15515 rtx x;
15516
15517 /* Split the atomic operation into a sequence. */
15518 label = gen_label_rtx ();
15519 emit_label (label);
15520
15521 if (new_out)
15522 new_out = gen_lowpart (wmode, new_out);
15523 if (old_out)
15524 old_out = gen_lowpart (wmode, old_out);
15525 else
15526 old_out = new_out;
15527 value = simplify_gen_subreg (wmode, value, mode, 0);
15528
15529 /* The initial load can be relaxed for a __sync operation since a final
15530 barrier will be emitted to stop code hoisting. */
15531 if (is_sync)
15532 aarch64_emit_load_exclusive (mode, old_out, mem,
15533 GEN_INT (MEMMODEL_RELAXED));
15534 else
15535 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15536
15537 switch (code)
15538 {
15539 case SET:
15540 new_out = value;
15541 break;
15542
15543 case NOT:
15544 x = gen_rtx_AND (wmode, old_out, value);
15545 emit_insn (gen_rtx_SET (new_out, x));
15546 x = gen_rtx_NOT (wmode, new_out);
15547 emit_insn (gen_rtx_SET (new_out, x));
15548 break;
15549
15550 case MINUS:
15551 if (CONST_INT_P (value))
15552 {
15553 value = GEN_INT (-INTVAL (value));
15554 code = PLUS;
15555 }
15556 /* Fall through. */
15557
15558 default:
15559 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15560 emit_insn (gen_rtx_SET (new_out, x));
15561 break;
15562 }
15563
15564 aarch64_emit_store_exclusive (mode, cond, mem,
15565 gen_lowpart (mode, new_out), model_rtx);
15566
15567 if (aarch64_track_speculation)
15568 {
15569 /* Emit an explicit compare instruction, so that we can correctly
15570 track the condition codes. */
15571 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15572 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15573 }
15574 else
15575 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15576
15577 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15578 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15579 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15580
15581 /* Emit any final barrier needed for a __sync operation. */
15582 if (is_sync)
15583 aarch64_emit_post_barrier (model);
15584 }
15585
15586 static void
15587 aarch64_init_libfuncs (void)
15588 {
15589 /* Half-precision float operations. The compiler handles all operations
15590 with NULL libfuncs by converting to SFmode. */
15591
15592 /* Conversions. */
15593 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15594 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15595
15596 /* Arithmetic. */
15597 set_optab_libfunc (add_optab, HFmode, NULL);
15598 set_optab_libfunc (sdiv_optab, HFmode, NULL);
15599 set_optab_libfunc (smul_optab, HFmode, NULL);
15600 set_optab_libfunc (neg_optab, HFmode, NULL);
15601 set_optab_libfunc (sub_optab, HFmode, NULL);
15602
15603 /* Comparisons. */
15604 set_optab_libfunc (eq_optab, HFmode, NULL);
15605 set_optab_libfunc (ne_optab, HFmode, NULL);
15606 set_optab_libfunc (lt_optab, HFmode, NULL);
15607 set_optab_libfunc (le_optab, HFmode, NULL);
15608 set_optab_libfunc (ge_optab, HFmode, NULL);
15609 set_optab_libfunc (gt_optab, HFmode, NULL);
15610 set_optab_libfunc (unord_optab, HFmode, NULL);
15611 }
15612
15613 /* Target hook for c_mode_for_suffix. */
15614 static machine_mode
15615 aarch64_c_mode_for_suffix (char suffix)
15616 {
15617 if (suffix == 'q')
15618 return TFmode;
15619
15620 return VOIDmode;
15621 }
15622
15623 /* We can only represent floating point constants which will fit in
15624 "quarter-precision" values. These values are characterised by
15625 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
15626 by:
15627
15628 (-1)^s * (n/16) * 2^r
15629
15630 Where:
15631 's' is the sign bit.
15632 'n' is an integer in the range 16 <= n <= 31.
15633 'r' is an integer in the range -3 <= r <= 4. */
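
/* For example, 0.5 is representable as (+1) * (16/16) * 2^-1 and 31.0 as
   (+1) * (31/16) * 2^4, so the representable magnitudes run from 0.125
   (n = 16, r = -3) up to 31.0 (n = 31, r = 4).  */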
15634
15635 /* Return true iff X can be represented by a quarter-precision
15636 floating point immediate operand. Note, we cannot represent 0.0. */
15637 bool
15638 aarch64_float_const_representable_p (rtx x)
15639 {
15640 /* This represents our current view of how many bits
15641 make up the mantissa. */
15642 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15643 int exponent;
15644 unsigned HOST_WIDE_INT mantissa, mask;
15645 REAL_VALUE_TYPE r, m;
15646 bool fail;
15647
15648 if (!CONST_DOUBLE_P (x))
15649 return false;
15650
15651 if (GET_MODE (x) == VOIDmode
15652 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15653 return false;
15654
15655 r = *CONST_DOUBLE_REAL_VALUE (x);
15656
15657 /* We cannot represent infinities, NaNs or +/-zero. We won't
15658 know if we have +zero until we analyse the mantissa, but we
15659 can reject the other invalid values. */
15660 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15661 || REAL_VALUE_MINUS_ZERO (r))
15662 return false;
15663
15664 /* Extract exponent. */
15665 r = real_value_abs (&r);
15666 exponent = REAL_EXP (&r);
15667
15668 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15669 highest (sign) bit, with a fixed binary point at bit point_pos.
15670 m1 holds the low part of the mantissa, m2 the high part.
15671 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15672 bits for the mantissa, this can fail (low bits will be lost). */
15673 real_ldexp (&m, &r, point_pos - exponent);
15674 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15675
15676 /* If the low part of the mantissa has bits set we cannot represent
15677 the value. */
15678 if (w.ulow () != 0)
15679 return false;
15680 /* We have rejected the lower HOST_WIDE_INT, so update our
15681 understanding of how many bits lie in the mantissa and
15682 look only at the high HOST_WIDE_INT. */
15683 mantissa = w.elt (1);
15684 point_pos -= HOST_BITS_PER_WIDE_INT;
15685
15686 /* We can only represent values with a mantissa of the form 1.xxxx. */
15687 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15688 if ((mantissa & mask) != 0)
15689 return false;
15690
15691 /* Having filtered unrepresentable values, we may now remove all
15692 but the highest 5 bits. */
15693 mantissa >>= point_pos - 5;
15694
15695 /* We cannot represent the value 0.0, so reject it. This is handled
15696 elsewhere. */
15697 if (mantissa == 0)
15698 return false;
15699
15700 /* Then, as bit 4 is always set, we can mask it off, leaving
15701 the mantissa in the range [0, 15]. */
15702 mantissa &= ~(1 << 4);
15703 gcc_assert (mantissa <= 15);
15704
15705 /* GCC internally does not use IEEE754-like encoding (where normalized
15706 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
15707 Our mantissa values are shifted 4 places to the left relative to
15708 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15709 by 5 places to correct for GCC's representation. */
15710 exponent = 5 - exponent;
15711
15712 return (exponent >= 0 && exponent <= 7);
15713 }
15714
15715 /* Return the asm string for moving the AdvSIMD immediate CONST_VECTOR into a
15716 vector register of WIDTH bits, using a MOVI/MVNI, ORR or BIC immediate as
15717 selected by WHICH. */
15718 char*
15719 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15720 enum simd_immediate_check which)
15721 {
15722 bool is_valid;
15723 static char templ[40];
15724 const char *mnemonic;
15725 const char *shift_op;
15726 unsigned int lane_count = 0;
15727 char element_char;
15728
15729 struct simd_immediate_info info;
15730
15731 /* This will return true to show const_vector is legal for use as either
15732 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15733 It will also update INFO to show how the immediate should be generated.
15734 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
15735 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15736 gcc_assert (is_valid);
15737
15738 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15739 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15740
15741 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15742 {
15743 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15744 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15745 move immediate path. */
15746 if (aarch64_float_const_zero_rtx_p (info.value))
15747 info.value = GEN_INT (0);
15748 else
15749 {
15750 const unsigned int buf_size = 20;
15751 char float_buf[buf_size] = {'\0'};
15752 real_to_decimal_for_mode (float_buf,
15753 CONST_DOUBLE_REAL_VALUE (info.value),
15754 buf_size, buf_size, 1, info.elt_mode);
15755
15756 if (lane_count == 1)
15757 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15758 else
15759 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15760 lane_count, element_char, float_buf);
15761 return templ;
15762 }
15763 }
15764
15765 gcc_assert (CONST_INT_P (info.value));
15766
15767 if (which == AARCH64_CHECK_MOV)
15768 {
15769 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15770 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15771 if (lane_count == 1)
15772 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15773 mnemonic, UINTVAL (info.value));
15774 else if (info.shift)
15775 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15776 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15777 element_char, UINTVAL (info.value), shift_op, info.shift);
15778 else
15779 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15780 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15781 element_char, UINTVAL (info.value));
15782 }
15783 else
15784 {
15785 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15786 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15787 if (info.shift)
15788 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15789 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15790 element_char, UINTVAL (info.value), "lsl", info.shift);
15791 else
15792 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15793 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15794 element_char, UINTVAL (info.value));
15795 }
15796 return templ;
15797 }
15798
15799 char*
15800 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15801 {
15802
15803 /* If a floating point number was passed and we desire to use it in an
15804 integer mode do the conversion to integer. */
15805 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15806 {
15807 unsigned HOST_WIDE_INT ival;
15808 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15809 gcc_unreachable ();
15810 immediate = gen_int_mode (ival, mode);
15811 }
15812
15813 machine_mode vmode;
15814 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
15815 a 128-bit vector mode. */
15816 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15817
15818 vmode = aarch64_simd_container_mode (mode, width);
15819 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15820 return aarch64_output_simd_mov_immediate (v_op, width);
15821 }
15822
15823 /* Return the output string to use for moving immediate CONST_VECTOR
15824 into an SVE register. */
15825
15826 char *
15827 aarch64_output_sve_mov_immediate (rtx const_vector)
15828 {
15829 static char templ[40];
15830 struct simd_immediate_info info;
15831 char element_char;
15832
15833 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15834 gcc_assert (is_valid);
15835
15836 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15837
15838 if (info.step)
15839 {
15840 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15841 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15842 element_char, INTVAL (info.value), INTVAL (info.step));
15843 return templ;
15844 }
15845
15846 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15847 {
15848 if (aarch64_float_const_zero_rtx_p (info.value))
15849 info.value = GEN_INT (0);
15850 else
15851 {
15852 const int buf_size = 20;
15853 char float_buf[buf_size] = {};
15854 real_to_decimal_for_mode (float_buf,
15855 CONST_DOUBLE_REAL_VALUE (info.value),
15856 buf_size, buf_size, 1, info.elt_mode);
15857
15858 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15859 element_char, float_buf);
15860 return templ;
15861 }
15862 }
15863
15864 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15865 element_char, INTVAL (info.value));
15866 return templ;
15867 }
15868
15869 /* Return the asm format for a PTRUE instruction whose destination has
15870 mode MODE. SUFFIX is the element size suffix. */
15871
15872 char *
15873 aarch64_output_ptrue (machine_mode mode, char suffix)
15874 {
15875 unsigned int nunits;
15876 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15877 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15878 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15879 else
15880 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15881 return buf;
15882 }
15883
15884 /* Split a move of op[1] and op[2] into the low and high halves of op[0]. */
15885
15886 void
15887 aarch64_split_combinev16qi (rtx operands[3])
15888 {
15889 unsigned int dest = REGNO (operands[0]);
15890 unsigned int src1 = REGNO (operands[1]);
15891 unsigned int src2 = REGNO (operands[2]);
15892 machine_mode halfmode = GET_MODE (operands[1]);
15893 unsigned int halfregs = REG_NREGS (operands[1]);
15894 rtx destlo, desthi;
15895
15896 gcc_assert (halfmode == V16QImode);
15897
15898 if (src1 == dest && src2 == dest + halfregs)
15899 {
15900 /* No-op move. Can't split to nothing; emit something. */
15901 emit_note (NOTE_INSN_DELETED);
15902 return;
15903 }
15904
15905 /* Preserve register attributes for variable tracking. */
15906 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15907 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15908 GET_MODE_SIZE (halfmode));
15909
15910 /* Special case of reversed high/low parts. */
15911 if (reg_overlap_mentioned_p (operands[2], destlo)
15912 && reg_overlap_mentioned_p (operands[1], desthi))
15913 {
15914 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15915 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15916 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15917 }
15918 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15919 {
15920 /* Try to avoid unnecessary moves if part of the result
15921 is in the right place already. */
15922 if (src1 != dest)
15923 emit_move_insn (destlo, operands[1]);
15924 if (src2 != dest + halfregs)
15925 emit_move_insn (desthi, operands[2]);
15926 }
15927 else
15928 {
15929 if (src2 != dest + halfregs)
15930 emit_move_insn (desthi, operands[2]);
15931 if (src1 != dest)
15932 emit_move_insn (destlo, operands[1]);
15933 }
15934 }
15935
15936 /* vec_perm support. */
15937
15938 struct expand_vec_perm_d
15939 {
15940 rtx target, op0, op1;
15941 vec_perm_indices perm;
15942 machine_mode vmode;
15943 unsigned int vec_flags;
15944 bool one_vector_p;
15945 bool testing_p;
15946 };
15947
15948 /* Generate a variable permutation. */
15949
15950 static void
15951 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15952 {
15953 machine_mode vmode = GET_MODE (target);
15954 bool one_vector_p = rtx_equal_p (op0, op1);
15955
15956 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15957 gcc_checking_assert (GET_MODE (op0) == vmode);
15958 gcc_checking_assert (GET_MODE (op1) == vmode);
15959 gcc_checking_assert (GET_MODE (sel) == vmode);
15960 gcc_checking_assert (TARGET_SIMD);
15961
15962 if (one_vector_p)
15963 {
15964 if (vmode == V8QImode)
15965 {
15966 /* Expand the argument to a V16QI mode by duplicating it. */
15967 rtx pair = gen_reg_rtx (V16QImode);
15968 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15969 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15970 }
15971 else
15972 {
15973 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15974 }
15975 }
15976 else
15977 {
15978 rtx pair;
15979
15980 if (vmode == V8QImode)
15981 {
15982 pair = gen_reg_rtx (V16QImode);
15983 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15984 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15985 }
15986 else
15987 {
15988 pair = gen_reg_rtx (OImode);
15989 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15990 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15991 }
15992 }
15993 }
15994
15995 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15996 NELT is the number of elements in the vector. */
15997
15998 void
15999 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16000 unsigned int nelt)
16001 {
16002 machine_mode vmode = GET_MODE (target);
16003 bool one_vector_p = rtx_equal_p (op0, op1);
16004 rtx mask;
16005
16006 /* The TBL instruction does not use a modulo index, so we must take care
16007 of that ourselves. */
16008 mask = aarch64_simd_gen_const_vector_dup (vmode,
16009 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16010 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16011
16012 /* For big-endian, we also need to reverse the index within the vector
16013 (but not which vector). */
16014 if (BYTES_BIG_ENDIAN)
16015 {
16016 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16017 if (!one_vector_p)
16018 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16019 sel = expand_simple_binop (vmode, XOR, sel, mask,
16020 NULL, 0, OPTAB_LIB_WIDEN);
16021 }
16022 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16023 }
16024
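/* A worked example of the masking above, with values chosen purely for
   illustration.  For a two-vector V16QImode permute (nelt == 16) the
   selector is ANDed with 2 * 16 - 1 == 31, so every index wraps into the
   0..31 range that a two-register TBL can address.  On big-endian each
   index is additionally XORed with 15:

     index 3  (element 3 of the first vector)  -> 3 ^ 15  == 12
     index 19 (element 3 of the second vector) -> 19 ^ 15 == 28 == 16 + 12

   i.e. the element index is reversed within its vector while bit 4,
   which selects the vector, is left untouched.  */
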
16025 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16026
16027 static void
16028 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16029 {
16030 emit_insn (gen_rtx_SET (target,
16031 gen_rtx_UNSPEC (GET_MODE (target),
16032 gen_rtvec (2, op0, op1), code)));
16033 }
16034
16035 /* Expand an SVE vec_perm with the given operands. */
16036
16037 void
16038 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16039 {
16040 machine_mode data_mode = GET_MODE (target);
16041 machine_mode sel_mode = GET_MODE (sel);
16042 /* Enforced by the pattern condition. */
16043 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16044
16045 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16046 size of the two value vectors, i.e. the upper bits of the indices
16047 are effectively ignored. SVE TBL instead produces 0 for any
16048 out-of-range indices, so we need to modulo all the vec_perm indices
16049 to ensure they are all in range. */
16050 rtx sel_reg = force_reg (sel_mode, sel);
16051
16052 /* Check if SEL only references the first values vector. */
16053 if (GET_CODE (sel) == CONST_VECTOR
16054 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16055 {
16056 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16057 return;
16058 }
16059
16060 /* Check if the two values vectors are the same. */
16061 if (rtx_equal_p (op0, op1))
16062 {
16063 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16064 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16065 NULL, 0, OPTAB_DIRECT);
16066 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16067 return;
16068 }
16069
16070 /* Run a TBL on each value vector and combine the results. */
16071
16072 rtx res0 = gen_reg_rtx (data_mode);
16073 rtx res1 = gen_reg_rtx (data_mode);
16074 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16075 if (GET_CODE (sel) != CONST_VECTOR
16076 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16077 {
16078 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16079 2 * nunits - 1);
16080 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16081 NULL, 0, OPTAB_DIRECT);
16082 }
16083 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16084 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16085 NULL, 0, OPTAB_DIRECT);
16086 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16087 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16088 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16089 else
16090 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16091 }
16092
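/* A small example of the two-TBL expansion above, using nunits == 4 and
   illustrative selector values.  For sel == { 1, 6, 3, 5 }:

     AND with 2 * 4 - 1:   sel     == { 1, 6, 3, 5 }
     TBL op0, sel:         res0    == { op0[1], 0, op0[3], 0 }
     ADD sel, -4:          sel_sub == { -3, 2, -1, 1 }
     TBL op1, sel_sub:     res1    == { 0, op1[2], 0, op1[1] }
     ORR res0, res1:       target  == { op0[1], op1[2], op0[3], op1[1] }

   Indices that are out of range (or, after the subtraction, negative and
   hence very large when read as unsigned) produce zero, so the final OR
   merges the two partial results without interference.  */
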
16093 /* Recognize patterns suitable for the TRN instructions. */
16094 static bool
16095 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16096 {
16097 HOST_WIDE_INT odd;
16098 poly_uint64 nelt = d->perm.length ();
16099 rtx out, in0, in1, x;
16100 machine_mode vmode = d->vmode;
16101
16102 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16103 return false;
16104
16105 /* Note that these are little-endian tests.
16106 We correct for big-endian later. */
16107 if (!d->perm[0].is_constant (&odd)
16108 || (odd != 0 && odd != 1)
16109 || !d->perm.series_p (0, 2, odd, 2)
16110 || !d->perm.series_p (1, 2, nelt + odd, 2))
16111 return false;
16112
16113 /* Success! */
16114 if (d->testing_p)
16115 return true;
16116
16117 in0 = d->op0;
16118 in1 = d->op1;
16119 /* We don't need a big-endian lane correction for SVE; see the comment
16120 at the head of aarch64-sve.md for details. */
16121 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16122 {
16123 x = in0, in0 = in1, in1 = x;
16124 odd = !odd;
16125 }
16126 out = d->target;
16127
16128 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16129 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16130 return true;
16131 }
16132
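/* For illustration, the (little-endian) index patterns this matches for
   a two-input V4SImode permute (nelt == 4) are:

     { 0, 4, 2, 6 }  ->  TRN1   (odd == 0)
     { 1, 5, 3, 7 }  ->  TRN2   (odd == 1)

   i.e. the even-numbered (respectively odd-numbered) elements of the two
   inputs, interleaved.  */
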
16133 /* Recognize patterns suitable for the UZP instructions. */
16134 static bool
16135 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16136 {
16137 HOST_WIDE_INT odd;
16138 rtx out, in0, in1, x;
16139 machine_mode vmode = d->vmode;
16140
16141 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16142 return false;
16143
16144 /* Note that these are little-endian tests.
16145 We correct for big-endian later. */
16146 if (!d->perm[0].is_constant (&odd)
16147 || (odd != 0 && odd != 1)
16148 || !d->perm.series_p (0, 1, odd, 2))
16149 return false;
16150
16151 /* Success! */
16152 if (d->testing_p)
16153 return true;
16154
16155 in0 = d->op0;
16156 in1 = d->op1;
16157 /* We don't need a big-endian lane correction for SVE; see the comment
16158 at the head of aarch64-sve.md for details. */
16159 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16160 {
16161 x = in0, in0 = in1, in1 = x;
16162 odd = !odd;
16163 }
16164 out = d->target;
16165
16166 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16167 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16168 return true;
16169 }
16170
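/* For illustration, the (little-endian) index patterns this matches for
   a two-input V4SImode permute (nelt == 4) are:

     { 0, 2, 4, 6 }  ->  UZP1   (odd == 0)
     { 1, 3, 5, 7 }  ->  UZP2   (odd == 1)

   i.e. the concatenation of the even-numbered (respectively
   odd-numbered) elements of the two inputs.  */
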
16171 /* Recognize patterns suitable for the ZIP instructions. */
16172 static bool
16173 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16174 {
16175 unsigned int high;
16176 poly_uint64 nelt = d->perm.length ();
16177 rtx out, in0, in1, x;
16178 machine_mode vmode = d->vmode;
16179
16180 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16181 return false;
16182
16183 /* Note that these are little-endian tests.
16184 We correct for big-endian later. */
16185 poly_uint64 first = d->perm[0];
16186 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16187 || !d->perm.series_p (0, 2, first, 1)
16188 || !d->perm.series_p (1, 2, first + nelt, 1))
16189 return false;
16190 high = maybe_ne (first, 0U);
16191
16192 /* Success! */
16193 if (d->testing_p)
16194 return true;
16195
16196 in0 = d->op0;
16197 in1 = d->op1;
16198 /* We don't need a big-endian lane correction for SVE; see the comment
16199 at the head of aarch64-sve.md for details. */
16200 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16201 {
16202 x = in0, in0 = in1, in1 = x;
16203 high = !high;
16204 }
16205 out = d->target;
16206
16207 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16208 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16209 return true;
16210 }
16211
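/* For illustration, the (little-endian) index patterns this matches for
   a two-input V4SImode permute (nelt == 4) are:

     { 0, 4, 1, 5 }  ->  ZIP1   (first == 0)
     { 2, 6, 3, 7 }  ->  ZIP2   (first == nelt / 2)

   i.e. the low (respectively high) halves of the two inputs, interleaved
   element by element.  */
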
16212 /* Recognize patterns for the EXT insn. */
16213
16214 static bool
16215 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16216 {
16217 HOST_WIDE_INT location;
16218 rtx offset;
16219
16220 /* The first element always refers to the first vector.
16221 Check if the extracted indices are increasing by one. */
16222 if (d->vec_flags == VEC_SVE_PRED
16223 || !d->perm[0].is_constant (&location)
16224 || !d->perm.series_p (0, 1, location, 1))
16225 return false;
16226
16227 /* Success! */
16228 if (d->testing_p)
16229 return true;
16230
16231 /* The case where (location == 0) is a no-op for both big- and little-endian,
16232 and is removed by the mid-end at optimization levels -O1 and higher.
16233
16234 We don't need a big-endian lane correction for SVE; see the comment
16235 at the head of aarch64-sve.md for details. */
16236 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16237 {
16238 /* After setup, we want the high elements of the first vector (stored
16239 at the LSB end of the register), and the low elements of the second
16240 vector (stored at the MSB end of the register). So swap. */
16241 std::swap (d->op0, d->op1);
16242 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16243 to_constant () is safe since this is restricted to Advanced SIMD
16244 vectors. */
16245 location = d->perm.length ().to_constant () - location;
16246 }
16247
16248 offset = GEN_INT (location);
16249 emit_set_insn (d->target,
16250 gen_rtx_UNSPEC (d->vmode,
16251 gen_rtvec (3, d->op0, d->op1, offset),
16252 UNSPEC_EXT));
16253 return true;
16254 }
16255
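/* For illustration, a two-input V4SImode permute with indices

     { 1, 2, 3, 4 }

   is matched here with location == 1: on little-endian it becomes a
   single EXT that takes the top three elements of the first vector
   followed by the bottom element of the second.  On big-endian Advanced
   SIMD the operands are swapped and location becomes 4 - 1 == 3, as in
   the code above.  */
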
16256 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16257 within each 64-bit, 32-bit or 16-bit granule. */
16258
16259 static bool
16260 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16261 {
16262 HOST_WIDE_INT diff;
16263 unsigned int i, size, unspec;
16264 machine_mode pred_mode;
16265
16266 if (d->vec_flags == VEC_SVE_PRED
16267 || !d->one_vector_p
16268 || !d->perm[0].is_constant (&diff))
16269 return false;
16270
16271 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16272 if (size == 8)
16273 {
16274 unspec = UNSPEC_REV64;
16275 pred_mode = VNx2BImode;
16276 }
16277 else if (size == 4)
16278 {
16279 unspec = UNSPEC_REV32;
16280 pred_mode = VNx4BImode;
16281 }
16282 else if (size == 2)
16283 {
16284 unspec = UNSPEC_REV16;
16285 pred_mode = VNx8BImode;
16286 }
16287 else
16288 return false;
16289
16290 unsigned int step = diff + 1;
16291 for (i = 0; i < step; ++i)
16292 if (!d->perm.series_p (i, step, diff - i, step))
16293 return false;
16294
16295 /* Success! */
16296 if (d->testing_p)
16297 return true;
16298
16299 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16300 if (d->vec_flags == VEC_SVE_DATA)
16301 {
16302 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16303 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16304 UNSPEC_MERGE_PTRUE);
16305 }
16306 emit_set_insn (d->target, src);
16307 return true;
16308 }
16309
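/* For illustration, single-input index patterns this matches include
   (values chosen for exposition):

     V8HImode  { 1, 0, 3, 2, 5, 4, 7, 6 }  ->  REV32  (size == 4)
     V4SImode  { 1, 0, 3, 2 }              ->  REV64  (size == 8)
     V16QImode { 1, 0, 3, 2, ..., 15, 14 } ->  REV16  (size == 2)

   i.e. the elements are reversed within each 16-, 32- or 64-bit granule
   while the granules themselves stay in place.  */
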
16310 /* Recognize patterns for the REV insn, which reverses elements within
16311 a full vector. */
16312
16313 static bool
16314 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16315 {
16316 poly_uint64 nelt = d->perm.length ();
16317
16318 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16319 return false;
16320
16321 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16322 return false;
16323
16324 /* Success! */
16325 if (d->testing_p)
16326 return true;
16327
16328 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16329 emit_set_insn (d->target, src);
16330 return true;
16331 }
16332
16333 static bool
16334 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16335 {
16336 rtx out = d->target;
16337 rtx in0;
16338 HOST_WIDE_INT elt;
16339 machine_mode vmode = d->vmode;
16340 rtx lane;
16341
16342 if (d->vec_flags == VEC_SVE_PRED
16343 || d->perm.encoding ().encoded_nelts () != 1
16344 || !d->perm[0].is_constant (&elt))
16345 return false;
16346
16347 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16348 return false;
16349
16350 /* Success! */
16351 if (d->testing_p)
16352 return true;
16353
16354 /* The generic preparation in aarch64_expand_vec_perm_const_1
16355 swaps the operand order and the permute indices if it finds
16356 d->perm[0] to be in the second operand. Thus, we can always
16357 use d->op0 and need not do any extra arithmetic to get the
16358 correct lane number. */
16359 in0 = d->op0;
16360 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16361
16362 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16363 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16364 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16365 return true;
16366 }
16367
16368 static bool
16369 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16370 {
16371 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16372 machine_mode vmode = d->vmode;
16373
16374 /* Make sure that the indices are constant. */
16375 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16376 for (unsigned int i = 0; i < encoded_nelts; ++i)
16377 if (!d->perm[i].is_constant ())
16378 return false;
16379
16380 if (d->testing_p)
16381 return true;
16382
16383 /* Generic code will try constant permutation twice: once with the
16384 original mode and again with the elements lowered to QImode.
16385 So wait, and don't do the selector expansion ourselves. */
16386 if (vmode != V8QImode && vmode != V16QImode)
16387 return false;
16388
16389 /* to_constant is safe since this routine is specific to Advanced SIMD
16390 vectors. */
16391 unsigned int nelt = d->perm.length ().to_constant ();
16392 for (unsigned int i = 0; i < nelt; ++i)
16393 /* If big-endian and two vectors, we end up with a weird mixed-endian
16394 mode on NEON. Reverse the index within each word but not the word
16395 itself. to_constant is safe because we checked is_constant above. */
16396 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16397 ? d->perm[i].to_constant () ^ (nelt - 1)
16398 : d->perm[i].to_constant ());
16399
16400 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16401 sel = force_reg (vmode, sel);
16402
16403 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16404 return true;
16405 }
16406
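/* As a small illustration of the big-endian correction above: for a
   two-input V16QImode permute, a selector element of 2 is rewritten as
   2 ^ 15 == 13 and an element of 17 as 17 ^ 15 == 30, so the byte index
   is reversed within each input register while the bit that chooses
   between the two registers is preserved.  */
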
16407 /* Try to implement D using an SVE TBL instruction. */
16408
16409 static bool
16410 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16411 {
16412 unsigned HOST_WIDE_INT nelt;
16413
16414 /* Permuting two variable-length vectors could overflow the
16415 index range. */
16416 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16417 return false;
16418
16419 if (d->testing_p)
16420 return true;
16421
16422 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16423 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16424 if (d->one_vector_p)
16425 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16426 else
16427 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16428 return true;
16429 }
16430
16431 static bool
16432 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16433 {
16434 /* The pattern matching functions above are written to look for a small
16435 number to begin the sequence (0, 1, N/2). If we begin with an index
16436 from the second operand, we can swap the operands. */
16437 poly_int64 nelt = d->perm.length ();
16438 if (known_ge (d->perm[0], nelt))
16439 {
16440 d->perm.rotate_inputs (1);
16441 std::swap (d->op0, d->op1);
16442 }
16443
16444 if ((d->vec_flags == VEC_ADVSIMD
16445 || d->vec_flags == VEC_SVE_DATA
16446 || d->vec_flags == VEC_SVE_PRED)
16447 && known_gt (nelt, 1))
16448 {
16449 if (aarch64_evpc_rev_local (d))
16450 return true;
16451 else if (aarch64_evpc_rev_global (d))
16452 return true;
16453 else if (aarch64_evpc_ext (d))
16454 return true;
16455 else if (aarch64_evpc_dup (d))
16456 return true;
16457 else if (aarch64_evpc_zip (d))
16458 return true;
16459 else if (aarch64_evpc_uzp (d))
16460 return true;
16461 else if (aarch64_evpc_trn (d))
16462 return true;
16463 if (d->vec_flags == VEC_SVE_DATA)
16464 return aarch64_evpc_sve_tbl (d);
16465 else if (d->vec_flags == VEC_ADVSIMD)
16466 return aarch64_evpc_tbl (d);
16467 }
16468 return false;
16469 }
16470
16471 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16472
16473 static bool
16474 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16475 rtx op1, const vec_perm_indices &sel)
16476 {
16477 struct expand_vec_perm_d d;
16478
16479 /* Check whether the mask can be applied to a single vector. */
16480 if (sel.ninputs () == 1
16481 || (op0 && rtx_equal_p (op0, op1)))
16482 d.one_vector_p = true;
16483 else if (sel.all_from_input_p (0))
16484 {
16485 d.one_vector_p = true;
16486 op1 = op0;
16487 }
16488 else if (sel.all_from_input_p (1))
16489 {
16490 d.one_vector_p = true;
16491 op0 = op1;
16492 }
16493 else
16494 d.one_vector_p = false;
16495
16496 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16497 sel.nelts_per_input ());
16498 d.vmode = vmode;
16499 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16500 d.target = target;
16501 d.op0 = op0;
16502 d.op1 = op1;
16503 d.testing_p = !target;
16504
16505 if (!d.testing_p)
16506 return aarch64_expand_vec_perm_const_1 (&d);
16507
16508 rtx_insn *last = get_last_insn ();
16509 bool ret = aarch64_expand_vec_perm_const_1 (&d);
16510 gcc_assert (last == get_last_insn ());
16511
16512 return ret;
16513 }
16514
16515 /* Generate a byte permute mask for a register of mode MODE,
16516 which has NUNITS units. */
16517
16518 rtx
16519 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16520 {
16521 /* We have to reverse each vector because we don't have
16522 a permuted load that can reverse-load according to ABI rules. */
16523 rtx mask;
16524 rtvec v = rtvec_alloc (16);
16525 unsigned int i, j;
16526 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16527
16528 gcc_assert (BYTES_BIG_ENDIAN);
16529 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16530
16531 for (i = 0; i < nunits; i++)
16532 for (j = 0; j < usize; j++)
16533 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16534 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16535 return force_reg (V16QImode, mask);
16536 }
16537
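/* For illustration, for V8HImode (nunits == 8, unit size == 2) the
   generated V16QImode mask is

     { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }

   i.e. the bytes within each 16-bit element are reversed while the
   elements themselves stay in place.  */
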
16538 /* Return true if X is a valid second operand for the SVE instruction
16539 that implements integer comparison OP_CODE. */
16540
16541 static bool
16542 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16543 {
16544 if (register_operand (x, VOIDmode))
16545 return true;
16546
16547 switch (op_code)
16548 {
16549 case LTU:
16550 case LEU:
16551 case GEU:
16552 case GTU:
16553 return aarch64_sve_cmp_immediate_p (x, false);
16554 case LT:
16555 case LE:
16556 case GE:
16557 case GT:
16558 case NE:
16559 case EQ:
16560 return aarch64_sve_cmp_immediate_p (x, true);
16561 default:
16562 gcc_unreachable ();
16563 }
16564 }
16565
16566 /* Use predicated SVE instructions to implement the equivalent of:
16567
16568 (set TARGET OP)
16569
16570 given that PTRUE is an all-true predicate of the appropriate mode. */
16571
16572 static void
16573 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16574 {
16575 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16576 gen_rtvec (2, ptrue, op),
16577 UNSPEC_MERGE_PTRUE);
16578 rtx_insn *insn = emit_set_insn (target, unspec);
16579 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16580 }
16581
16582 /* Likewise, but also clobber the condition codes. */
16583
16584 static void
16585 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16586 {
16587 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16588 gen_rtvec (2, ptrue, op),
16589 UNSPEC_MERGE_PTRUE);
16590 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16591 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16592 }
16593
16594 /* Return the UNSPEC_COND_* code for comparison CODE. */
16595
16596 static unsigned int
16597 aarch64_unspec_cond_code (rtx_code code)
16598 {
16599 switch (code)
16600 {
16601 case NE:
16602 return UNSPEC_COND_NE;
16603 case EQ:
16604 return UNSPEC_COND_EQ;
16605 case LT:
16606 return UNSPEC_COND_LT;
16607 case GT:
16608 return UNSPEC_COND_GT;
16609 case LE:
16610 return UNSPEC_COND_LE;
16611 case GE:
16612 return UNSPEC_COND_GE;
16613 default:
16614 gcc_unreachable ();
16615 }
16616 }
16617
16618 /* Emit:
16619
16620 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16621
16622 where <X> is the operation associated with comparison CODE. This form
16623 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16624 semantics, such as when PRED might not be all-true and when comparing
16625 inactive lanes could have side effects. */
16626
16627 static void
16628 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16629 rtx pred, rtx op0, rtx op1)
16630 {
16631 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16632 gen_rtvec (3, pred, op0, op1),
16633 aarch64_unspec_cond_code (code));
16634 emit_set_insn (target, unspec);
16635 }
16636
16637 /* Expand an SVE integer comparison using the SVE equivalent of:
16638
16639 (set TARGET (CODE OP0 OP1)). */
16640
16641 void
16642 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16643 {
16644 machine_mode pred_mode = GET_MODE (target);
16645 machine_mode data_mode = GET_MODE (op0);
16646
16647 if (!aarch64_sve_cmp_operand_p (code, op1))
16648 op1 = force_reg (data_mode, op1);
16649
16650 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16651 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16652 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16653 }
16654
16655 /* Emit the SVE equivalent of:
16656
16657 (set TMP1 (CODE1 OP0 OP1))
16658 (set TMP2 (CODE2 OP0 OP1))
16659 (set TARGET (ior:PRED_MODE TMP1 TMP2))
16660
16661 PTRUE is an all-true predicate with the same mode as TARGET. */
16662
16663 static void
16664 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16665 rtx ptrue, rtx op0, rtx op1)
16666 {
16667 machine_mode pred_mode = GET_MODE (ptrue);
16668 rtx tmp1 = gen_reg_rtx (pred_mode);
16669 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16670 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16671 rtx tmp2 = gen_reg_rtx (pred_mode);
16672 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16673 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16674 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16675 }
16676
16677 /* Emit the SVE equivalent of:
16678
16679 (set TMP (CODE OP0 OP1))
16680 (set TARGET (not TMP))
16681
16682 PTRUE is an all-true predicate with the same mode as TARGET. */
16683
16684 static void
16685 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16686 rtx op0, rtx op1)
16687 {
16688 machine_mode pred_mode = GET_MODE (ptrue);
16689 rtx tmp = gen_reg_rtx (pred_mode);
16690 aarch64_emit_sve_ptrue_op (tmp, ptrue,
16691 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16692 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16693 }
16694
16695 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16696
16697 (set TARGET (CODE OP0 OP1))
16698
16699 If CAN_INVERT_P is true, the caller can also handle inverted results;
16700 return true if the result is in fact inverted. */
16701
16702 bool
16703 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16704 rtx op0, rtx op1, bool can_invert_p)
16705 {
16706 machine_mode pred_mode = GET_MODE (target);
16707 machine_mode data_mode = GET_MODE (op0);
16708
16709 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16710 switch (code)
16711 {
16712 case UNORDERED:
16713 /* UNORDERED has no immediate form. */
16714 op1 = force_reg (data_mode, op1);
16715 /* fall through */
16716 case LT:
16717 case LE:
16718 case GT:
16719 case GE:
16720 case EQ:
16721 case NE:
16722 {
16723 /* There is native support for the comparison. */
16724 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16725 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16726 return false;
16727 }
16728
16729 case LTGT:
16730 /* This is a trapping operation (LT or GT). */
16731 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16732 return false;
16733
16734 case UNEQ:
16735 if (!flag_trapping_math)
16736 {
16737 /* This would trap for signaling NaNs. */
16738 op1 = force_reg (data_mode, op1);
16739 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16740 return false;
16741 }
16742 /* fall through */
16743 case UNLT:
16744 case UNLE:
16745 case UNGT:
16746 case UNGE:
16747 if (flag_trapping_math)
16748 {
16749 /* Work out which elements are ordered. */
16750 rtx ordered = gen_reg_rtx (pred_mode);
16751 op1 = force_reg (data_mode, op1);
16752 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16753
16754 /* Test the opposite condition for the ordered elements,
16755 then invert the result. */
16756 if (code == UNEQ)
16757 code = NE;
16758 else
16759 code = reverse_condition_maybe_unordered (code);
16760 if (can_invert_p)
16761 {
16762 aarch64_emit_sve_predicated_cond (target, code,
16763 ordered, op0, op1);
16764 return true;
16765 }
16766 rtx tmp = gen_reg_rtx (pred_mode);
16767 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16768 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16769 return false;
16770 }
16771 break;
16772
16773 case ORDERED:
16774 /* ORDERED has no immediate form. */
16775 op1 = force_reg (data_mode, op1);
16776 break;
16777
16778 default:
16779 gcc_unreachable ();
16780 }
16781
16782 /* There is native support for the inverse comparison. */
16783 code = reverse_condition_maybe_unordered (code);
16784 if (can_invert_p)
16785 {
16786 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16787 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16788 return true;
16789 }
16790 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16791 return false;
16792 }
16793
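/* As a worked example of the trapping-math path above (illustrative
   only): for UNLT, the code first computes an "ordered" predicate as the
   inverse of UNORDERED, then tests the reversed condition GE under that
   predicate, and finally inverts the result (or asks the caller to, when
   CAN_INVERT_P).  This yields

     UNLT (x, y) == NOT (ordered (x, y) && x >= y)

   which is true exactly when x and y are unordered or x < y, and the
   predicated GE comparison never executes on unordered (potentially
   trapping) lanes.  */
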
16794 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16795 of the data being selected and CMP_MODE is the mode of the values being
16796 compared. */
16797
16798 void
16799 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16800 rtx *ops)
16801 {
16802 machine_mode pred_mode
16803 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16804 GET_MODE_SIZE (cmp_mode)).require ();
16805 rtx pred = gen_reg_rtx (pred_mode);
16806 if (FLOAT_MODE_P (cmp_mode))
16807 {
16808 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16809 ops[4], ops[5], true))
16810 std::swap (ops[1], ops[2]);
16811 }
16812 else
16813 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16814
16815 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16816 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16817 }
16818
16819 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16820 true. However, due to issues with register allocation it is preferable
16821 to avoid tying integer scalar and FP scalar modes. Executing integer
16822 operations in general registers is better than treating them as scalar
16823 vector operations. This reduces latency and avoids redundant int<->FP
16824 moves. So tie modes if they are either of the same class, or vector modes
16825 with other vector modes, vector structs or any scalar mode. */
16826
16827 static bool
16828 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16829 {
16830 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16831 return true;
16832
16833 /* We specifically want to allow elements of "structure" modes to
16834 be tieable to the structure. This more general condition allows
16835 other rarer situations too. The reason we don't extend this to
16836 predicate modes is that there are no predicate structure modes
16837 nor any specific instructions for extracting part of a predicate
16838 register. */
16839 if (aarch64_vector_data_mode_p (mode1)
16840 && aarch64_vector_data_mode_p (mode2))
16841 return true;
16842
16843 /* Also allow any scalar modes with vectors. */
16844 if (aarch64_vector_mode_supported_p (mode1)
16845 || aarch64_vector_mode_supported_p (mode2))
16846 return true;
16847
16848 return false;
16849 }
16850
16851 /* Return a new RTX holding the result of moving POINTER forward by
16852 AMOUNT bytes. */
16853
16854 static rtx
16855 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16856 {
16857 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16858
16859 return adjust_automodify_address (pointer, GET_MODE (pointer),
16860 next, amount);
16861 }
16862
16863 /* Return a new RTX holding the result of moving POINTER forward by the
16864 size of the mode it points to. */
16865
16866 static rtx
16867 aarch64_progress_pointer (rtx pointer)
16868 {
16869 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16870 }
16871
16872 /* Copy one block of size MODE from SRC to DST, then advance SRC and DST
16873 by the size of MODE. */
16874
16875 static void
16876 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16877 machine_mode mode)
16878 {
16879 rtx reg = gen_reg_rtx (mode);
16880
16881 /* "Cast" the pointers to the correct mode. */
16882 *src = adjust_address (*src, mode, 0);
16883 *dst = adjust_address (*dst, mode, 0);
16884 /* Emit the memcpy. */
16885 emit_move_insn (reg, *src);
16886 emit_move_insn (*dst, reg);
16887 /* Move the pointers forward. */
16888 *src = aarch64_progress_pointer (*src);
16889 *dst = aarch64_progress_pointer (*dst);
16890 }
16891
16892 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16893 we succeed, otherwise return false. */
16894
16895 bool
16896 aarch64_expand_movmem (rtx *operands)
16897 {
16898 int n, mode_bits;
16899 rtx dst = operands[0];
16900 rtx src = operands[1];
16901 rtx base;
16902 machine_mode cur_mode = BLKmode, next_mode;
16903 bool speed_p = !optimize_function_for_size_p (cfun);
16904
16905 /* When optimizing for size, give a better estimate of the length of a
16906 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16907 will always require an even number of instructions, and each operation
16908 requires both a load and a store, so divide the maximum number by 2. */
16909 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16910
16911 /* We can't do anything smart if the amount to copy is not constant. */
16912 if (!CONST_INT_P (operands[2]))
16913 return false;
16914
16915 n = INTVAL (operands[2]);
16916
16917 /* Try to keep the number of instructions low. For all cases we will do at
16918 most two moves for the residual amount, since we'll always overlap the
16919 remainder. */
16920 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16921 return false;
16922
16923 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16924 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16925
16926 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16927 src = adjust_automodify_address (src, VOIDmode, base, 0);
16928
16929 /* Convert n to bits to make the rest of the code simpler. */
16930 n = n * BITS_PER_UNIT;
16931
16932 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
16933 larger than TImode, but we should not use them for loads/stores here. */
16934 const int copy_limit = GET_MODE_BITSIZE (TImode);
16935
16936 while (n > 0)
16937 {
16938 /* Find the largest mode in which to do the copy without over-reading
16939 or over-writing. */
16940 opt_scalar_int_mode mode_iter;
16941 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16942 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
16943 cur_mode = mode_iter.require ();
16944
16945 gcc_assert (cur_mode != BLKmode);
16946
16947 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16948 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16949
16950 n -= mode_bits;
16951
16952 /* Do certain trailing copies as overlapping if it's going to be
16953 cheaper, i.e. fewer instructions. For instance, for a 15-byte
16954 copy it is more efficient to do two overlapping 8-byte copies than
16955 8 + 6 + 1. */
16956 if (n > 0 && n <= 8 * BITS_PER_UNIT)
16957 {
16958 next_mode = smallest_mode_for_size (n, MODE_INT);
16959 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16960 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16961 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16962 n = n_bits;
16963 }
16964 }
16965
16966 return true;
16967 }
16968
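/* As an illustration of the copy loop above (sizes chosen for
   exposition): a 23-byte copy is expanded as one TImode (16-byte)
   load/store at offset 0 followed by one DImode (8-byte) load/store at
   offset 15, the two copies overlapping by a single byte.  That is two
   load/store pairs instead of the 16 + 4 + 2 + 1 sequence a
   non-overlapping decomposition would need.  */
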
16969 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16970 SImode stores. Handle the case when the constant has identical
16971 bottom and top halves. This is beneficial when the two stores can be
16972 merged into an STP and we avoid synthesising potentially expensive
16973 immediates twice. Return true if such a split is possible. */
16974
16975 bool
16976 aarch64_split_dimode_const_store (rtx dst, rtx src)
16977 {
16978 rtx lo = gen_lowpart (SImode, src);
16979 rtx hi = gen_highpart_mode (SImode, DImode, src);
16980
16981 bool size_p = optimize_function_for_size_p (cfun);
16982
16983 if (!rtx_equal_p (lo, hi))
16984 return false;
16985
16986 unsigned int orig_cost
16987 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16988 unsigned int lo_cost
16989 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16990
16991 /* We want to transform:
16992 MOV x1, 49370
16993 MOVK x1, 0x140, lsl 16
16994 MOVK x1, 0xc0da, lsl 32
16995 MOVK x1, 0x140, lsl 48
16996 STR x1, [x0]
16997 into:
16998 MOV w1, 49370
16999 MOVK w1, 0x140, lsl 16
17000 STP w1, w1, [x0]
17001 So we want to perform this only when we save two instructions
17002 or more. When optimizing for size, however, accept any code size
17003 savings we can. */
17004 if (size_p && orig_cost <= lo_cost)
17005 return false;
17006
17007 if (!size_p
17008 && (orig_cost <= lo_cost + 1))
17009 return false;
17010
17011 rtx mem_lo = adjust_address (dst, SImode, 0);
17012 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17013 return false;
17014
17015 rtx tmp_reg = gen_reg_rtx (SImode);
17016 aarch64_expand_mov_immediate (tmp_reg, lo);
17017 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17018 /* Don't emit an explicit store pair as this may not always be profitable.
17019 Let the sched-fusion logic decide whether to merge them. */
17020 emit_move_insn (mem_lo, tmp_reg);
17021 emit_move_insn (mem_hi, tmp_reg);
17022
17023 return true;
17024 }
17025
17026 /* Generate RTL for a conditional branch with rtx comparison CODE in
17027 mode CC_MODE. The destination of the unlikely conditional branch
17028 is LABEL_REF. */
17029
17030 void
17031 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17032 rtx label_ref)
17033 {
17034 rtx x;
17035 x = gen_rtx_fmt_ee (code, VOIDmode,
17036 gen_rtx_REG (cc_mode, CC_REGNUM),
17037 const0_rtx);
17038
17039 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17040 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17041 pc_rtx);
17042 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17043 }
17044
17045 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17046
17047 OP1 represents the TImode destination operand 1
17048 OP2 represents the TImode destination operand 2
17049 LOW_DEST represents the low half (DImode) of TImode operand 0
17050 LOW_IN1 represents the low half (DImode) of TImode operand 1
17051 LOW_IN2 represents the low half (DImode) of TImode operand 2
17052 HIGH_DEST represents the high half (DImode) of TImode operand 0
17053 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17054 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17055
17056 void
17057 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17058 rtx *low_in1, rtx *low_in2,
17059 rtx *high_dest, rtx *high_in1,
17060 rtx *high_in2)
17061 {
17062 *low_dest = gen_reg_rtx (DImode);
17063 *low_in1 = gen_lowpart (DImode, op1);
17064 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17065 subreg_lowpart_offset (DImode, TImode));
17066 *high_dest = gen_reg_rtx (DImode);
17067 *high_in1 = gen_highpart (DImode, op1);
17068 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17069 subreg_highpart_offset (DImode, TImode));
17070 }
17071
17072 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17073
17074 This function differs from 'aarch64_addti_scratch_regs' in that
17075 OP1 can be an immediate constant (zero). We must call
17076 subreg_highpart_offset with DImode and TImode arguments, otherwise
17077 VOIDmode will be used for the const_int, which generates an internal
17078 error from subreg_size_highpart_offset, which does not expect a size of zero.
17079
17080 OP1 represents the TImode destination operand 1
17081 OP2 represents the TImode destination operand 2
17082 LOW_DEST represents the low half (DImode) of TImode operand 0
17083 LOW_IN1 represents the low half (DImode) of TImode operand 1
17084 LOW_IN2 represents the low half (DImode) of TImode operand 2
17085 HIGH_DEST represents the high half (DImode) of TImode operand 0
17086 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17087 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17088
17089
17090 void
17091 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17092 rtx *low_in1, rtx *low_in2,
17093 rtx *high_dest, rtx *high_in1,
17094 rtx *high_in2)
17095 {
17096 *low_dest = gen_reg_rtx (DImode);
17097 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17098 subreg_lowpart_offset (DImode, TImode));
17099
17100 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17101 subreg_lowpart_offset (DImode, TImode));
17102 *high_dest = gen_reg_rtx (DImode);
17103
17104 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17105 subreg_highpart_offset (DImode, TImode));
17106 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17107 subreg_highpart_offset (DImode, TImode));
17108 }
17109
17110 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17111
17112 OP0 represents the TImode destination operand 0
17113 LOW_DEST represents the low half (DImode) of TImode operand 0
17114 LOW_IN1 represents the low half (DImode) of TImode operand 1
17115 LOW_IN2 represents the low half (DImode) of TImode operand 2
17116 HIGH_DEST represents the high half (DImode) of TImode operand 0
17117 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17118 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17119 UNSIGNED_P is true if the operation is being performed on unsigned
17120 values. */
17121 void
17122 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17123 rtx low_in2, rtx high_dest, rtx high_in1,
17124 rtx high_in2, bool unsigned_p)
17125 {
17126 if (low_in2 == const0_rtx)
17127 {
17128 low_dest = low_in1;
17129 high_in2 = force_reg (DImode, high_in2);
17130 if (unsigned_p)
17131 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17132 else
17133 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17134 }
17135 else
17136 {
17137 if (CONST_INT_P (low_in2))
17138 {
17139 high_in2 = force_reg (DImode, high_in2);
17140 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17141 GEN_INT (-INTVAL (low_in2))));
17142 }
17143 else
17144 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17145
17146 if (unsigned_p)
17147 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17148 else
17149 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17150 }
17151
17152 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17153 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17154
17155 }
17156
17157 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17158
17159 static unsigned HOST_WIDE_INT
17160 aarch64_asan_shadow_offset (void)
17161 {
17162 return (HOST_WIDE_INT_1 << 36);
17163 }
17164
17165 static rtx
17166 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17167 int code, tree treeop0, tree treeop1)
17168 {
17169 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17170 rtx op0, op1;
17171 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17172 insn_code icode;
17173 struct expand_operand ops[4];
17174
17175 start_sequence ();
17176 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17177
17178 op_mode = GET_MODE (op0);
17179 if (op_mode == VOIDmode)
17180 op_mode = GET_MODE (op1);
17181
17182 switch (op_mode)
17183 {
17184 case E_QImode:
17185 case E_HImode:
17186 case E_SImode:
17187 cmp_mode = SImode;
17188 icode = CODE_FOR_cmpsi;
17189 break;
17190
17191 case E_DImode:
17192 cmp_mode = DImode;
17193 icode = CODE_FOR_cmpdi;
17194 break;
17195
17196 case E_SFmode:
17197 cmp_mode = SFmode;
17198 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17199 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17200 break;
17201
17202 case E_DFmode:
17203 cmp_mode = DFmode;
17204 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17205 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17206 break;
17207
17208 default:
17209 end_sequence ();
17210 return NULL_RTX;
17211 }
17212
17213 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17214 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17215 if (!op0 || !op1)
17216 {
17217 end_sequence ();
17218 return NULL_RTX;
17219 }
17220 *prep_seq = get_insns ();
17221 end_sequence ();
17222
17223 create_fixed_operand (&ops[0], op0);
17224 create_fixed_operand (&ops[1], op1);
17225
17226 start_sequence ();
17227 if (!maybe_expand_insn (icode, 2, ops))
17228 {
17229 end_sequence ();
17230 return NULL_RTX;
17231 }
17232 *gen_seq = get_insns ();
17233 end_sequence ();
17234
17235 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17236 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17237 }
17238
17239 static rtx
17240 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17241 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17242 {
17243 rtx op0, op1, target;
17244 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17245 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17246 insn_code icode;
17247 struct expand_operand ops[6];
17248 int aarch64_cond;
17249
17250 push_to_sequence (*prep_seq);
17251 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17252
17253 op_mode = GET_MODE (op0);
17254 if (op_mode == VOIDmode)
17255 op_mode = GET_MODE (op1);
17256
17257 switch (op_mode)
17258 {
17259 case E_QImode:
17260 case E_HImode:
17261 case E_SImode:
17262 cmp_mode = SImode;
17263 icode = CODE_FOR_ccmpsi;
17264 break;
17265
17266 case E_DImode:
17267 cmp_mode = DImode;
17268 icode = CODE_FOR_ccmpdi;
17269 break;
17270
17271 case E_SFmode:
17272 cmp_mode = SFmode;
17273 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17274 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17275 break;
17276
17277 case E_DFmode:
17278 cmp_mode = DFmode;
17279 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17280 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17281 break;
17282
17283 default:
17284 end_sequence ();
17285 return NULL_RTX;
17286 }
17287
17288 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17289 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17290 if (!op0 || !op1)
17291 {
17292 end_sequence ();
17293 return NULL_RTX;
17294 }
17295 *prep_seq = get_insns ();
17296 end_sequence ();
17297
17298 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17299 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17300
17301 if (bit_code != AND)
17302 {
17303 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17304 GET_MODE (XEXP (prev, 0))),
17305 VOIDmode, XEXP (prev, 0), const0_rtx);
17306 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17307 }
17308
17309 create_fixed_operand (&ops[0], XEXP (prev, 0));
17310 create_fixed_operand (&ops[1], target);
17311 create_fixed_operand (&ops[2], op0);
17312 create_fixed_operand (&ops[3], op1);
17313 create_fixed_operand (&ops[4], prev);
17314 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17315
17316 push_to_sequence (*gen_seq);
17317 if (!maybe_expand_insn (icode, 6, ops))
17318 {
17319 end_sequence ();
17320 return NULL_RTX;
17321 }
17322
17323 *gen_seq = get_insns ();
17324 end_sequence ();
17325
17326 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17327 }
17328
17329 #undef TARGET_GEN_CCMP_FIRST
17330 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17331
17332 #undef TARGET_GEN_CCMP_NEXT
17333 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17334
17335 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17336 instruction fusion of some sort. */
17337
17338 static bool
17339 aarch64_macro_fusion_p (void)
17340 {
17341 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17342 }
17343
17344
17345 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17346 should be kept together during scheduling. */
17347
17348 static bool
17349 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17350 {
17351 rtx set_dest;
17352 rtx prev_set = single_set (prev);
17353 rtx curr_set = single_set (curr);
17354 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
17355 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17356
17357 if (!aarch64_macro_fusion_p ())
17358 return false;
17359
17360 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17361 {
17362 /* We are trying to match:
17363 prev (mov) == (set (reg r0) (const_int imm16))
17364 curr (movk) == (set (zero_extract (reg r0)
17365 (const_int 16)
17366 (const_int 16))
17367 (const_int imm16_1)) */
17368
17369 set_dest = SET_DEST (curr_set);
17370
17371 if (GET_CODE (set_dest) == ZERO_EXTRACT
17372 && CONST_INT_P (SET_SRC (curr_set))
17373 && CONST_INT_P (SET_SRC (prev_set))
17374 && CONST_INT_P (XEXP (set_dest, 2))
17375 && INTVAL (XEXP (set_dest, 2)) == 16
17376 && REG_P (XEXP (set_dest, 0))
17377 && REG_P (SET_DEST (prev_set))
17378 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17379 {
17380 return true;
17381 }
17382 }
17383
17384 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17385 {
17386
17387 /* We're trying to match:
17388 prev (adrp) == (set (reg r1)
17389 (high (symbol_ref ("SYM"))))
17390 curr (add) == (set (reg r0)
17391 (lo_sum (reg r1)
17392 (symbol_ref ("SYM"))))
17393 Note that r0 need not necessarily be the same as r1, especially
17394 during pre-regalloc scheduling. */
17395
17396 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17397 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17398 {
17399 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17400 && REG_P (XEXP (SET_SRC (curr_set), 0))
17401 && REGNO (XEXP (SET_SRC (curr_set), 0))
17402 == REGNO (SET_DEST (prev_set))
17403 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17404 XEXP (SET_SRC (curr_set), 1)))
17405 return true;
17406 }
17407 }
17408
17409 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17410 {
17411
17412 /* We're trying to match:
17413 prev (movk) == (set (zero_extract (reg r0)
17414 (const_int 16)
17415 (const_int 32))
17416 (const_int imm16_1))
17417 curr (movk) == (set (zero_extract (reg r0)
17418 (const_int 16)
17419 (const_int 48))
17420 (const_int imm16_2)) */
17421
17422 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17423 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17424 && REG_P (XEXP (SET_DEST (prev_set), 0))
17425 && REG_P (XEXP (SET_DEST (curr_set), 0))
17426 && REGNO (XEXP (SET_DEST (prev_set), 0))
17427 == REGNO (XEXP (SET_DEST (curr_set), 0))
17428 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17429 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17430 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17431 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17432 && CONST_INT_P (SET_SRC (prev_set))
17433 && CONST_INT_P (SET_SRC (curr_set)))
17434 return true;
17435
17436 }
17437 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17438 {
17439 /* We're trying to match:
17440 prev (adrp) == (set (reg r0)
17441 (high (symbol_ref ("SYM"))))
17442 curr (ldr) == (set (reg r1)
17443 (mem (lo_sum (reg r0)
17444 (symbol_ref ("SYM")))))
17445 or
17446 curr (ldr) == (set (reg r1)
17447 (zero_extend (mem
17448 (lo_sum (reg r0)
17449 (symbol_ref ("SYM")))))) */
17450 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17451 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17452 {
17453 rtx curr_src = SET_SRC (curr_set);
17454
17455 if (GET_CODE (curr_src) == ZERO_EXTEND)
17456 curr_src = XEXP (curr_src, 0);
17457
17458 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17459 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17460 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17461 == REGNO (SET_DEST (prev_set))
17462 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17463 XEXP (SET_SRC (prev_set), 0)))
17464 return true;
17465 }
17466 }
17467
17468 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17469 && aarch_crypto_can_dual_issue (prev, curr))
17470 return true;
17471
17472 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17473 && any_condjump_p (curr))
17474 {
17475 unsigned int condreg1, condreg2;
17476 rtx cc_reg_1;
17477 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17478 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17479
17480 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17481 && prev
17482 && modified_in_p (cc_reg_1, prev))
17483 {
17484 enum attr_type prev_type = get_attr_type (prev);
17485
17486 /* FIXME: this misses some instructions that ThunderX considers simple
17487 arithmetic; simple shifts are missed here as well. */
17488 if (prev_type == TYPE_ALUS_SREG
17489 || prev_type == TYPE_ALUS_IMM
17490 || prev_type == TYPE_LOGICS_REG
17491 || prev_type == TYPE_LOGICS_IMM)
17492 return true;
17493 }
17494 }
17495
17496 if (prev_set
17497 && curr_set
17498 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17499 && any_condjump_p (curr))
17500 {
17501 /* We're trying to match:
17502 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17503 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
17504 (const_int 0))
17505 (label_ref ("SYM"))
17506 (pc)) */
17507 if (SET_DEST (curr_set) == (pc_rtx)
17508 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17509 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17510 && REG_P (SET_DEST (prev_set))
17511 && REGNO (SET_DEST (prev_set))
17512 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17513 {
17514 /* Fuse ALU operations followed by a conditional branch instruction. */
17515 switch (get_attr_type (prev))
17516 {
17517 case TYPE_ALU_IMM:
17518 case TYPE_ALU_SREG:
17519 case TYPE_ADC_REG:
17520 case TYPE_ADC_IMM:
17521 case TYPE_ADCS_REG:
17522 case TYPE_ADCS_IMM:
17523 case TYPE_LOGIC_REG:
17524 case TYPE_LOGIC_IMM:
17525 case TYPE_CSEL:
17526 case TYPE_ADR:
17527 case TYPE_MOV_IMM:
17528 case TYPE_SHIFT_REG:
17529 case TYPE_SHIFT_IMM:
17530 case TYPE_BFM:
17531 case TYPE_RBIT:
17532 case TYPE_REV:
17533 case TYPE_EXTEND:
17534 return true;
17535
17536 default:;
17537 }
17538 }
17539 }
17540
17541 return false;
17542 }
17543
17544 /* Return true iff the instruction fusion described by OP is enabled. */
17545
17546 bool
17547 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17548 {
17549 return (aarch64_tune_params.fusible_ops & op) != 0;
17550 }
17551
17552 /* If MEM is in the form [base+offset], extract the two parts of the
17553 address and store them in BASE and OFFSET; otherwise return false
17554 after clearing BASE and OFFSET. */
17555
17556 bool
17557 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17558 {
17559 rtx addr;
17560
17561 gcc_assert (MEM_P (mem));
17562
17563 addr = XEXP (mem, 0);
17564
17565 if (REG_P (addr))
17566 {
17567 *base = addr;
17568 *offset = const0_rtx;
17569 return true;
17570 }
17571
17572 if (GET_CODE (addr) == PLUS
17573 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17574 {
17575 *base = XEXP (addr, 0);
17576 *offset = XEXP (addr, 1);
17577 return true;
17578 }
17579
17580 *base = NULL_RTX;
17581 *offset = NULL_RTX;
17582
17583 return false;
17584 }
17585
17586 /* Types for scheduling fusion. */
17587 enum sched_fusion_type
17588 {
17589 SCHED_FUSION_NONE = 0,
17590 SCHED_FUSION_LD_SIGN_EXTEND,
17591 SCHED_FUSION_LD_ZERO_EXTEND,
17592 SCHED_FUSION_LD,
17593 SCHED_FUSION_ST,
17594 SCHED_FUSION_NUM
17595 };
17596
17597 /* If INSN is a load or store with an address of the form [base+offset],
17598 extract the two parts into BASE and OFFSET. Return the scheduling
17599 fusion type of this INSN. */
17600
17601 static enum sched_fusion_type
17602 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17603 {
17604 rtx x, dest, src;
17605 enum sched_fusion_type fusion = SCHED_FUSION_LD;
17606
17607 gcc_assert (INSN_P (insn));
17608 x = PATTERN (insn);
17609 if (GET_CODE (x) != SET)
17610 return SCHED_FUSION_NONE;
17611
17612 src = SET_SRC (x);
17613 dest = SET_DEST (x);
17614
17615 machine_mode dest_mode = GET_MODE (dest);
17616
17617 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17618 return SCHED_FUSION_NONE;
17619
17620 if (GET_CODE (src) == SIGN_EXTEND)
17621 {
17622 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17623 src = XEXP (src, 0);
17624 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17625 return SCHED_FUSION_NONE;
17626 }
17627 else if (GET_CODE (src) == ZERO_EXTEND)
17628 {
17629 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17630 src = XEXP (src, 0);
17631 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17632 return SCHED_FUSION_NONE;
17633 }
17634
17635 if (GET_CODE (src) == MEM && REG_P (dest))
17636 extract_base_offset_in_addr (src, base, offset);
17637 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17638 {
17639 fusion = SCHED_FUSION_ST;
17640 extract_base_offset_in_addr (dest, base, offset);
17641 }
17642 else
17643 return SCHED_FUSION_NONE;
17644
17645 if (*base == NULL_RTX || *offset == NULL_RTX)
17646 fusion = SCHED_FUSION_NONE;
17647
17648 return fusion;
17649 }
17650
17651 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17652
17653 Currently we only support fusing ldr and str instructions, so FUSION_PRI
17654 and PRI are only calculated for these instructions. For other instructions,
17655 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
17656 types of instruction fusion can be added by returning different priorities.
17657
17658 It's important that irrelevant instructions get the largest FUSION_PRI. */
17659
17660 static void
17661 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17662 int *fusion_pri, int *pri)
17663 {
17664 int tmp, off_val;
17665 rtx base, offset;
17666 enum sched_fusion_type fusion;
17667
17668 gcc_assert (INSN_P (insn));
17669
17670 tmp = max_pri - 1;
17671 fusion = fusion_load_store (insn, &base, &offset);
17672 if (fusion == SCHED_FUSION_NONE)
17673 {
17674 *pri = tmp;
17675 *fusion_pri = tmp;
17676 return;
17677 }
17678
17679 /* Set FUSION_PRI according to fusion type and base register. */
17680 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17681
17682 /* Calculate PRI. */
17683 tmp /= 2;
17684
17685 /* INSN with smaller offset goes first. */
17686 off_val = (int)(INTVAL (offset));
17687 if (off_val >= 0)
17688 tmp -= (off_val & 0xfffff);
17689 else
17690 tmp += ((- off_val) & 0xfffff);
17691
17692 *pri = tmp;
17693 return;
17694 }
17695
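/* As an illustration (registers and offsets chosen for exposition): two
   loads ldr w0, [x1, 8] and ldr w2, [x1, 16] receive the same
   FUSION_PRI, because they have the same fusion type and the same base
   register, while the load with the smaller offset gets the larger PRI
   and is therefore scheduled first, so that the two end up adjacent and
   can later be combined into an LDP.  */
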
17696 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17697 Adjust priority of sha1h instructions so they are scheduled before
17698 other SHA1 instructions. */
17699
17700 static int
17701 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17702 {
17703 rtx x = PATTERN (insn);
17704
17705 if (GET_CODE (x) == SET)
17706 {
17707 x = SET_SRC (x);
17708
17709 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17710 return priority + 10;
17711 }
17712
17713 return priority;
17714 }
17715
17716 /* Given OPERANDS of consecutive load/store, check if we can merge
17717 them into ldp/stp. LOAD is true if they are load instructions.
17718 MODE is the mode of memory operands. */
17719
17720 bool
17721 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17722 machine_mode mode)
17723 {
17724 HOST_WIDE_INT offval_1, offval_2, msize;
17725 enum reg_class rclass_1, rclass_2;
17726 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17727
17728 if (load)
17729 {
17730 mem_1 = operands[1];
17731 mem_2 = operands[3];
17732 reg_1 = operands[0];
17733 reg_2 = operands[2];
17734 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17735 if (REGNO (reg_1) == REGNO (reg_2))
17736 return false;
17737 }
17738 else
17739 {
17740 mem_1 = operands[0];
17741 mem_2 = operands[2];
17742 reg_1 = operands[1];
17743 reg_2 = operands[3];
17744 }
17745
17746 /* The mems cannot be volatile. */
17747 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17748 return false;
17749
17750 /* If we have SImode and slow unaligned ldp,
17751 check that the alignment is at least 8 bytes. */
17752 if (mode == SImode
17753 && (aarch64_tune_params.extra_tuning_flags
17754 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17755 && !optimize_size
17756 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17757 return false;
17758
17759 /* Check if the addresses are in the form of [base+offset]. */
17760 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17761 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17762 return false;
17763 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17764 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17765 return false;
17766
17767 /* Check if the bases are the same. */
17768 if (!rtx_equal_p (base_1, base_2))
17769 return false;
17770
17771 /* The operands must be of the same size. */
17772 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17773 GET_MODE_SIZE (GET_MODE (mem_2))));
17774
17775 offval_1 = INTVAL (offset_1);
17776 offval_2 = INTVAL (offset_2);
17777 /* We should only be trying this for fixed-sized modes. There is no
17778 SVE LDP/STP instruction. */
17779 msize = GET_MODE_SIZE (mode).to_constant ();
17780 /* Check if the offsets are consecutive. */
17781 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17782 return false;
17783
17784 /* Check if the addresses are clobbered by load. */
17785 if (load)
17786 {
17787 if (reg_mentioned_p (reg_1, mem_1))
17788 return false;
17789
17790 /* In increasing order, the last load can clobber the address. */
17791 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17792 return false;
17793 }
17794
17795 /* One of the memory accesses must be a mempair operand.
17796 If it is not the first one, they need to be swapped by the
17797 peephole. */
17798 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17799 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17800 return false;
17801
17802 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17803 rclass_1 = FP_REGS;
17804 else
17805 rclass_1 = GENERAL_REGS;
17806
17807 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17808 rclass_2 = FP_REGS;
17809 else
17810 rclass_2 = GENERAL_REGS;
17811
17812 /* Check if the registers are of the same class. */
17813 if (rclass_1 != rclass_2)
17814 return false;
17815
17816 return true;
17817 }
17818
17819 /* Given OPERANDS of consecutive load/store that can be merged,
17820 swap them if they are not in ascending order. */
17821 void
17822 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17823 {
17824 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17825 HOST_WIDE_INT offval_1, offval_2;
17826
17827 if (load)
17828 {
17829 mem_1 = operands[1];
17830 mem_2 = operands[3];
17831 }
17832 else
17833 {
17834 mem_1 = operands[0];
17835 mem_2 = operands[2];
17836 }
17837
17838 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17839 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17840
17841 offval_1 = INTVAL (offset_1);
17842 offval_2 = INTVAL (offset_2);
17843
17844 if (offval_1 > offval_2)
17845 {
17846 /* Irrespective of whether this is a load or a store,
17847 we do the same swap. */
17848 std::swap (operands[0], operands[2]);
17849 std::swap (operands[1], operands[3]);
17850 }
17851 }
17852
17853 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17854 comparison between the two. */
17855 int
17856 aarch64_host_wide_int_compare (const void *x, const void *y)
17857 {
17858 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17859 * ((const HOST_WIDE_INT *) y));
17860 }
17861
17862 /* Taking X and Y to be pairs of RTX, each pair consisting of a MEM rtx
17863 and a REG rtx, compare the offsets extracted from the addresses of
17864 the two MEMs.
17865
17866 Return:
17867
17868 1 iff offset (X) > offset (Y)
17869 0 iff offset (X) == offset (Y)
17870 -1 iff offset (X) < offset (Y) */
17871 int
17872 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17873 {
17874 const rtx * operands_1 = (const rtx *) x;
17875 const rtx * operands_2 = (const rtx *) y;
17876 rtx mem_1, mem_2, base, offset_1, offset_2;
17877
17878 if (MEM_P (operands_1[0]))
17879 mem_1 = operands_1[0];
17880 else
17881 mem_1 = operands_1[1];
17882
17883 if (MEM_P (operands_2[0]))
17884 mem_2 = operands_2[0];
17885 else
17886 mem_2 = operands_2[1];
17887
17888 /* Extract the offsets. */
17889 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17890 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17891
17892 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17893
17894 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17895 }
17896
17897 /* Given OPERANDS of consecutive load/store, check if we can merge
17898 them into ldp/stp by adjusting the offset. LOAD is true if they
17899 are load instructions. MODE is the mode of memory operands.
17900
17901 Given below consecutive stores:
17902
17903 str w1, [xb, 0x100]
17904 str w1, [xb, 0x104]
17905 str w1, [xb, 0x108]
17906 str w1, [xb, 0x10c]
17907
17908 Though the offsets are out of the range supported by stp, we can
17909 still pair them after adjusting the offset, like:
17910
17911 add scratch, xb, 0x100
17912 stp w1, w1, [scratch]
17913 stp w1, w1, [scratch, 0x8]
17914
17915 The peephole patterns detecting this opportunity should guarantee
17916 the scratch register is available. */
17917
17918 bool
17919 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17920 scalar_mode mode)
17921 {
17922 const int num_insns = 4;
17923 enum reg_class rclass;
17924 HOST_WIDE_INT offvals[num_insns], msize;
17925 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
17926
17927 if (load)
17928 {
17929 for (int i = 0; i < num_insns; i++)
17930 {
17931 reg[i] = operands[2 * i];
17932 mem[i] = operands[2 * i + 1];
17933
17934 gcc_assert (REG_P (reg[i]));
17935 }
17936
17937 /* Do not attempt to merge the loads if the loads clobber each other. */
17938 for (int i = 0; i < 8; i += 2)
17939 for (int j = i + 2; j < 8; j += 2)
17940 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17941 return false;
17942 }
17943 else
17944 for (int i = 0; i < num_insns; i++)
17945 {
17946 mem[i] = operands[2 * i];
17947 reg[i] = operands[2 * i + 1];
17948 }
17949
17950 /* Skip if memory operand is by itself valid for ldp/stp. */
17951 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
17952 return false;
17953
17954 for (int i = 0; i < num_insns; i++)
17955 {
17956 /* The mems cannot be volatile. */
17957 if (MEM_VOLATILE_P (mem[i]))
17958 return false;
17959
17960 /* Check if the addresses are in the form of [base+offset]. */
17961 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17962 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17963 return false;
17964 }
17965
17966 /* Check if the registers are of the same class. */
17967 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17968 ? FP_REGS : GENERAL_REGS;
17969
17970 for (int i = 1; i < num_insns; i++)
17971 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17972 {
17973 if (rclass != FP_REGS)
17974 return false;
17975 }
17976 else
17977 {
17978 if (rclass != GENERAL_REGS)
17979 return false;
17980 }
17981
17982 /* Only the last register in the order in which they occur
17983 may be clobbered by the load. */
17984 if (rclass == GENERAL_REGS && load)
17985 for (int i = 0; i < num_insns - 1; i++)
17986 if (reg_mentioned_p (reg[i], mem[i]))
17987 return false;
17988
17989 /* Check if the bases are the same. */
17990 for (int i = 0; i < num_insns - 1; i++)
17991 if (!rtx_equal_p (base[i], base[i + 1]))
17992 return false;
17993
17994 for (int i = 0; i < num_insns; i++)
17995 offvals[i] = INTVAL (offset[i]);
17996
17997 msize = GET_MODE_SIZE (mode);
17998
17999 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18000 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18001 aarch64_host_wide_int_compare);
18002
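/* After sorting, the accesses must form two adjacent pairs ({0,1} and
{2,3}), with the offsets in each pair differing by exactly MSIZE.  */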
18003 if (!(offvals[1] == offvals[0] + msize
18004 && offvals[3] == offvals[2] + msize))
18005 return false;
18006
18007 /* Check that the offsets are within range of each other. The ldp/stp
18008 instructions have a signed 7-bit scaled immediate offset, hence the 0x80 bound. */
18009 if (offvals[2] - offvals[0] >= msize * 0x80)
18010 return false;
18011
18012 /* The offsets must be aligned with respect to each other. */
18013 if (offvals[0] % msize != offvals[2] % msize)
18014 return false;
18015
18016 /* If we have SImode and slow unaligned ldp,
18017 check that the alignment is at least 8 bytes. */
18018 if (mode == SImode
18019 && (aarch64_tune_params.extra_tuning_flags
18020 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18021 && !optimize_size
18022 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18023 return false;
18024
18025 return true;
18026 }
18027
18028 /* Given OPERANDS of consecutive load/store, this function pairs them
18029 into LDP/STP after adjusting the offset. It depends on the fact
18030 that the operands can be sorted so the offsets are correct for STP.
18031 MODE is the mode of memory operands. CODE is the rtl operator
18032 which should be applied to all memory operands; it is SIGN_EXTEND,
18033 ZERO_EXTEND or UNKNOWN. */
18034
18035 bool
18036 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18037 scalar_mode mode, RTX_CODE code)
18038 {
18039 rtx base, offset_1, offset_3, t1, t2;
18040 rtx mem_1, mem_2, mem_3, mem_4;
18041 rtx temp_operands[8];
18042 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18043 stp_off_upper_limit, stp_off_lower_limit, msize;
18044
18045 /* We make changes on a copy as we may still bail out. */
18046 for (int i = 0; i < 8; i ++)
18047 temp_operands[i] = operands[i];
18048
18049 /* Sort the operands. */
18050 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18051
18052 if (load)
18053 {
18054 mem_1 = temp_operands[1];
18055 mem_2 = temp_operands[3];
18056 mem_3 = temp_operands[5];
18057 mem_4 = temp_operands[7];
18058 }
18059 else
18060 {
18061 mem_1 = temp_operands[0];
18062 mem_2 = temp_operands[2];
18063 mem_3 = temp_operands[4];
18064 mem_4 = temp_operands[6];
18065 gcc_assert (code == UNKNOWN);
18066 }
18067
18068 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18069 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18070 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18071 && offset_3 != NULL_RTX);
18072
18073 /* Adjust offset so it can fit in LDP/STP instruction. */
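/* The scaled immediate field is 7 bits signed, so the reachable byte
offsets are [-0x40 * msize, 0x3f * msize].  */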
18074 msize = GET_MODE_SIZE (mode);
18075 stp_off_upper_limit = msize * (0x40 - 1);
18076 stp_off_lower_limit = - msize * 0x40;
18077
18078 off_val_1 = INTVAL (offset_1);
18079 off_val_3 = INTVAL (offset_3);
18080
18081 /* The base offset is optimally half way between the two STP/LDP offsets. */
18082 if (msize <= 4)
18083 base_off = (off_val_1 + off_val_3) / 2;
18084 else
18085 /* However, due to issues with negative LDP/STP offset generation for
18086 larger modes (DF, DI and vector modes), we must not use negative
18087 addresses smaller than 9 signed unadjusted bits can store. This
18088 provides the most range in this case. */
18089 base_off = off_val_1;
18090
18091 /* Adjust the base so that it is aligned with the addresses but still
18092 optimal. */
18093 if (base_off % msize != off_val_1 % msize)
18094 /* Fix the offset, bearing in mind we want to make it bigger not
18095 smaller. */
18096 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18097 else if (msize <= 4)
18098 /* The negative range of LDP/STP is one larger than the positive range. */
18099 base_off += msize;
18100
18101 /* Check if base offset is too big or too small. We can attempt to resolve
18102 this issue by setting it to the maximum value and seeing if the offsets
18103 still fit. */
18104 if (base_off >= 0x1000)
18105 {
18106 base_off = 0x1000 - 1;
18107 /* We must still make sure that the base offset is aligned with respect
18108 to the address. But it may not be made any bigger. */
18109 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18110 }
18111
18112 /* Likewise for the case where the base is too small. */
18113 if (base_off <= -0x1000)
18114 {
18115 base_off = -0x1000 + 1;
18116 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18117 }
18118
18119 /* Offset of the first STP/LDP. */
18120 new_off_1 = off_val_1 - base_off;
18121
18122 /* Offset of the second STP/LDP. */
18123 new_off_3 = off_val_3 - base_off;
18124
18125 /* The offsets must be within the range of the LDP/STP instructions. */
18126 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18127 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18128 return false;
18129
18130 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18131 new_off_1), true);
18132 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18133 new_off_1 + msize), true);
18134 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18135 new_off_3), true);
18136 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18137 new_off_3 + msize), true);
18138
18139 if (!aarch64_mem_pair_operand (mem_1, mode)
18140 || !aarch64_mem_pair_operand (mem_3, mode))
18141 return false;
18142
18143 if (code == ZERO_EXTEND)
18144 {
18145 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18146 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18147 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18148 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18149 }
18150 else if (code == SIGN_EXTEND)
18151 {
18152 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18153 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18154 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18155 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18156 }
18157
18158 if (load)
18159 {
18160 operands[0] = temp_operands[0];
18161 operands[1] = mem_1;
18162 operands[2] = temp_operands[2];
18163 operands[3] = mem_2;
18164 operands[4] = temp_operands[4];
18165 operands[5] = mem_3;
18166 operands[6] = temp_operands[6];
18167 operands[7] = mem_4;
18168 }
18169 else
18170 {
18171 operands[0] = mem_1;
18172 operands[1] = temp_operands[1];
18173 operands[2] = mem_2;
18174 operands[3] = temp_operands[3];
18175 operands[4] = mem_3;
18176 operands[5] = temp_operands[5];
18177 operands[6] = mem_4;
18178 operands[7] = temp_operands[7];
18179 }
18180
18181 /* Emit adjusting instruction. */
18182 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18183 /* Emit ldp/stp instructions. */
18184 t1 = gen_rtx_SET (operands[0], operands[1]);
18185 t2 = gen_rtx_SET (operands[2], operands[3]);
18186 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18187 t1 = gen_rtx_SET (operands[4], operands[5]);
18188 t2 = gen_rtx_SET (operands[6], operands[7]);
18189 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18190 return true;
18191 }
18192
18193 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18194 it isn't worth branching around empty masked ops (including masked
18195 stores). */
18196
18197 static bool
18198 aarch64_empty_mask_is_expensive (unsigned)
18199 {
18200 return false;
18201 }
18202
18203 /* Return true if a pseudo register should be created and used to hold
18204 the GOT address for PIC code. */
18205
18206 bool
18207 aarch64_use_pseudo_pic_reg (void)
18208 {
18209 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18210 }
18211
18212 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18213
18214 static int
18215 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18216 {
18217 switch (XINT (x, 1))
18218 {
18219 case UNSPEC_GOTSMALLPIC:
18220 case UNSPEC_GOTSMALLPIC28K:
18221 case UNSPEC_GOTTINYPIC:
18222 return 0;
18223 default:
18224 break;
18225 }
18226
18227 return default_unspec_may_trap_p (x, flags);
18228 }
18229
18230
18231 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18232 return the log2 of that value. Otherwise return -1. */
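/* For example, 4.0 yields 2, while 0.5, 3.0 and -2.0 all yield -1.  */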
18233
18234 int
18235 aarch64_fpconst_pow_of_2 (rtx x)
18236 {
18237 const REAL_VALUE_TYPE *r;
18238
18239 if (!CONST_DOUBLE_P (x))
18240 return -1;
18241
18242 r = CONST_DOUBLE_REAL_VALUE (x);
18243
18244 if (REAL_VALUE_NEGATIVE (*r)
18245 || REAL_VALUE_ISNAN (*r)
18246 || REAL_VALUE_ISINF (*r)
18247 || !real_isinteger (r, DFmode))
18248 return -1;
18249
18250 return exact_log2 (real_to_integer (r));
18251 }
18252
18253 /* If X is a vector of equal CONST_DOUBLE values and that value is
18254 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18255
18256 int
18257 aarch64_vec_fpconst_pow_of_2 (rtx x)
18258 {
18259 int nelts;
18260 if (GET_CODE (x) != CONST_VECTOR
18261 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18262 return -1;
18263
18264 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18265 return -1;
18266
18267 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18268 if (firstval <= 0)
18269 return -1;
18270
18271 for (int i = 1; i < nelts; i++)
18272 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18273 return -1;
18274
18275 return firstval;
18276 }
18277
18278 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18279 to float.
18280
18281 __fp16 always promotes through this hook.
18282 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18283 through the generic excess precision logic rather than here. */
18284
18285 static tree
18286 aarch64_promoted_type (const_tree t)
18287 {
18288 if (SCALAR_FLOAT_TYPE_P (t)
18289 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18290 return float_type_node;
18291
18292 return NULL_TREE;
18293 }
18294
18295 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18296
18297 static bool
18298 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18299 optimization_type opt_type)
18300 {
18301 switch (op)
18302 {
18303 case rsqrt_optab:
18304 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18305
18306 default:
18307 return true;
18308 }
18309 }
18310
18311 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18312
18313 static unsigned int
18314 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18315 int *offset)
18316 {
18317 /* Polynomial invariant 1 == (VG / 2) - 1. */
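/* For example, with 256-bit SVE vectors VG (the number of 64-bit granules
in a vector) is 4, so the indeterminate evaluates to 4 / 2 - 1 = 1,
i.e. one 128-bit chunk beyond the minimum vector length.  */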
18318 gcc_assert (i == 1);
18319 *factor = 2;
18320 *offset = 1;
18321 return AARCH64_DWARF_VG;
18322 }
18323
18324 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18325 if MODE is HFmode, and punt to the generic implementation otherwise. */
18326
18327 static bool
18328 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18329 {
18330 return (mode == HFmode
18331 ? true
18332 : default_libgcc_floating_mode_supported_p (mode));
18333 }
18334
18335 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18336 if MODE is HFmode, and punt to the generic implementation otherwise. */
18337
18338 static bool
18339 aarch64_scalar_mode_supported_p (scalar_mode mode)
18340 {
18341 return (mode == HFmode
18342 ? true
18343 : default_scalar_mode_supported_p (mode));
18344 }
18345
18346 /* Set the value of FLT_EVAL_METHOD.
18347 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18348
18349 0: evaluate all operations and constants, whose semantic type has at
18350 most the range and precision of type float, to the range and
18351 precision of float; evaluate all other operations and constants to
18352 the range and precision of the semantic type;
18353
18354 N, where _FloatN is a supported interchange floating type
18355 evaluate all operations and constants, whose semantic type has at
18356 most the range and precision of _FloatN type, to the range and
18357 precision of the _FloatN type; evaluate all other operations and
18358 constants to the range and precision of the semantic type;
18359
18360 If we have the ARMv8.2-A extensions then we support _Float16 in native
18361 precision, so we should set this to 16. Otherwise, we support the type,
18362 but want to evaluate expressions in float precision, so set this to
18363 0. */
18364
18365 static enum flt_eval_method
18366 aarch64_excess_precision (enum excess_precision_type type)
18367 {
18368 switch (type)
18369 {
18370 case EXCESS_PRECISION_TYPE_FAST:
18371 case EXCESS_PRECISION_TYPE_STANDARD:
18372 /* We can calculate either in 16-bit range and precision or
18373 32-bit range and precision. Make that decision based on whether
18374 we have native support for the ARMv8.2-A 16-bit floating-point
18375 instructions or not. */
18376 return (TARGET_FP_F16INST
18377 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18378 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18379 case EXCESS_PRECISION_TYPE_IMPLICIT:
18380 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18381 default:
18382 gcc_unreachable ();
18383 }
18384 return FLT_EVAL_METHOD_UNPREDICTABLE;
18385 }
18386
18387 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18388 scheduled for speculative execution. Reject the long-running division
18389 and square-root instructions. */
18390
18391 static bool
18392 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18393 {
18394 switch (get_attr_type (insn))
18395 {
18396 case TYPE_SDIV:
18397 case TYPE_UDIV:
18398 case TYPE_FDIVS:
18399 case TYPE_FDIVD:
18400 case TYPE_FSQRTS:
18401 case TYPE_FSQRTD:
18402 case TYPE_NEON_FP_SQRT_S:
18403 case TYPE_NEON_FP_SQRT_D:
18404 case TYPE_NEON_FP_SQRT_S_Q:
18405 case TYPE_NEON_FP_SQRT_D_Q:
18406 case TYPE_NEON_FP_DIV_S:
18407 case TYPE_NEON_FP_DIV_D:
18408 case TYPE_NEON_FP_DIV_S_Q:
18409 case TYPE_NEON_FP_DIV_D_Q:
18410 return false;
18411 default:
18412 return true;
18413 }
18414 }
18415
18416 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18417
18418 static int
18419 aarch64_compute_pressure_classes (reg_class *classes)
18420 {
18421 int i = 0;
18422 classes[i++] = GENERAL_REGS;
18423 classes[i++] = FP_REGS;
18424 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18425 registers need to go in PR_LO_REGS at some point during their
18426 lifetime. Splitting it into two halves has the effect of making
18427 all predicates count against PR_LO_REGS, so that we try whenever
18428 possible to restrict the number of live predicates to 8. This
18429 greatly reduces the amount of spilling in certain loops. */
18430 classes[i++] = PR_LO_REGS;
18431 classes[i++] = PR_HI_REGS;
18432 return i;
18433 }
18434
18435 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18436
18437 static bool
18438 aarch64_can_change_mode_class (machine_mode from,
18439 machine_mode to, reg_class_t)
18440 {
18441 if (BYTES_BIG_ENDIAN)
18442 {
18443 bool from_sve_p = aarch64_sve_data_mode_p (from);
18444 bool to_sve_p = aarch64_sve_data_mode_p (to);
18445
18446 /* Don't allow changes between SVE data modes and non-SVE modes.
18447 See the comment at the head of aarch64-sve.md for details. */
18448 if (from_sve_p != to_sve_p)
18449 return false;
18450
18451 /* Don't allow changes in element size: lane 0 of the new vector
18452 would not then be lane 0 of the old vector. See the comment
18453 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18454 description.
18455
18456 In the worst case, this forces a register to be spilled in
18457 one mode and reloaded in the other, which handles the
18458 endianness correctly. */
18459 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18460 return false;
18461 }
18462 return true;
18463 }
18464
18465 /* Implement TARGET_EARLY_REMAT_MODES. */
18466
18467 static void
18468 aarch64_select_early_remat_modes (sbitmap modes)
18469 {
18470 /* SVE values are not normally live across a call, so it should be
18471 worth doing early rematerialization even in VL-specific mode. */
18472 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18473 {
18474 machine_mode mode = (machine_mode) i;
18475 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18476 if (vec_flags & VEC_ANY_SVE)
18477 bitmap_set_bit (modes, i);
18478 }
18479 }
18480
18481 /* Override the default target speculation_safe_value. */
18482 static rtx
18483 aarch64_speculation_safe_value (machine_mode mode,
18484 rtx result, rtx val, rtx failval)
18485 {
18486 /* Maybe we should warn if falling back to hard barriers. They are
18487 likely to be noticeably more expensive than the alternative below. */
18488 if (!aarch64_track_speculation)
18489 return default_speculation_safe_value (mode, result, val, failval);
18490
18491 if (!REG_P (val))
18492 val = copy_to_mode_reg (mode, val);
18493
18494 if (!aarch64_reg_or_zero (failval, mode))
18495 failval = copy_to_mode_reg (mode, failval);
18496
18497 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18498 return result;
18499 }
18500
18501 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18502 Look into the tuning structure for an estimate.
18503 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18504 Advanced SIMD 128 bits. */
18505
18506 static HOST_WIDE_INT
18507 aarch64_estimated_poly_value (poly_int64 val)
18508 {
18509 enum aarch64_sve_vector_bits_enum width_source
18510 = aarch64_tune_params.sve_width;
18511
18512 /* If the tuning structure gives no fixed estimate, use the default. */
18513 if (width_source == SVE_SCALABLE)
18514 return default_estimated_poly_value (val);
18515
18516 HOST_WIDE_INT over_128 = width_source - 128;
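/* For example, with sve_width == 256 a VNx16QI size of 16 + 16x bytes is
estimated as 16 + 16 * (256 - 128) / 128 = 32 bytes.  */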
18517 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18518 }
18519
18520
18521 /* Return true for types that could be supported as SIMD return or
18522 argument types. */
18523
18524 static bool
18525 supported_simd_type (tree t)
18526 {
18527 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18528 {
18529 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18530 return s == 1 || s == 2 || s == 4 || s == 8;
18531 }
18532 return false;
18533 }
18534
18535 /* Return true for types that currently are supported as SIMD return
18536 or argument types. */
18537
18538 static bool
18539 currently_supported_simd_type (tree t, tree b)
18540 {
18541 if (COMPLEX_FLOAT_TYPE_P (t))
18542 return false;
18543
18544 if (TYPE_SIZE (t) != TYPE_SIZE (b))
18545 return false;
18546
18547 return supported_simd_type (t);
18548 }
18549
18550 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
18551
18552 static int
18553 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
18554 struct cgraph_simd_clone *clonei,
18555 tree base_type, int num)
18556 {
18557 tree t, ret_type, arg_type;
18558 unsigned int elt_bits, vec_bits, count;
18559
18560 if (!TARGET_SIMD)
18561 return 0;
18562
18563 if (clonei->simdlen
18564 && (clonei->simdlen < 2
18565 || clonei->simdlen > 1024
18566 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
18567 {
18568 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18569 "unsupported simdlen %d", clonei->simdlen);
18570 return 0;
18571 }
18572
18573 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
18574 if (TREE_CODE (ret_type) != VOID_TYPE
18575 && !currently_supported_simd_type (ret_type, base_type))
18576 {
18577 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
18578 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18579 "GCC does not currently support mixed size types "
18580 "for %<simd%> functions");
18581 else if (supported_simd_type (ret_type))
18582 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18583 "GCC does not currently support return type %qT "
18584 "for %<simd%> functions", ret_type);
18585 else
18586 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18587 "unsupported return type %qT for %<simd%> functions",
18588 ret_type);
18589 return 0;
18590 }
18591
18592 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
18593 {
18594 arg_type = TREE_TYPE (t);
18595
18596 if (!currently_supported_simd_type (arg_type, base_type))
18597 {
18598 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
18599 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18600 "GCC does not currently support mixed size types "
18601 "for %<simd%> functions");
18602 else
18603 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18604 "GCC does not currently support argument type %qT "
18605 "for %<simd%> functions", arg_type);
18606 return 0;
18607 }
18608 }
18609
18610 clonei->vecsize_mangle = 'n';
18611 clonei->mask_mode = VOIDmode;
18612 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
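/* With no simdlen clause, advertise two clones (count == 2): a 64-bit
variant for NUM == 0 and a 128-bit variant for NUM == 1, e.g. simdlen 2
and 4 for a 32-bit base type.  */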
18613 if (clonei->simdlen == 0)
18614 {
18615 count = 2;
18616 vec_bits = (num == 0 ? 64 : 128);
18617 clonei->simdlen = vec_bits / elt_bits;
18618 }
18619 else
18620 {
18621 count = 1;
18622 vec_bits = clonei->simdlen * elt_bits;
18623 if (vec_bits != 64 && vec_bits != 128)
18624 {
18625 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18626 "GCC does not currently support simdlen %d for type %qT",
18627 clonei->simdlen, base_type);
18628 return 0;
18629 }
18630 }
18631 clonei->vecsize_int = vec_bits;
18632 clonei->vecsize_float = vec_bits;
18633 return count;
18634 }
18635
18636 /* Implement TARGET_SIMD_CLONE_ADJUST. */
18637
18638 static void
18639 aarch64_simd_clone_adjust (struct cgraph_node *node)
18640 {
18641 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
18642 use the correct ABI. */
18643
18644 tree t = TREE_TYPE (node->decl);
18645 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
18646 TYPE_ATTRIBUTES (t));
18647 }
18648
18649 /* Implement TARGET_SIMD_CLONE_USABLE. */
18650
18651 static int
18652 aarch64_simd_clone_usable (struct cgraph_node *node)
18653 {
18654 switch (node->simdclone->vecsize_mangle)
18655 {
18656 case 'n':
18657 if (!TARGET_SIMD)
18658 return -1;
18659 return 0;
18660 default:
18661 gcc_unreachable ();
18662 }
18663 }
18664
18665 /* Target-specific selftests. */
18666
18667 #if CHECKING_P
18668
18669 namespace selftest {
18670
18671 /* Selftest for the RTL loader.
18672 Verify that the RTL loader copes with a dump from
18673 print_rtx_function. This is essentially just a test that class
18674 function_reader can handle a real dump, but it also verifies
18675 that lookup_reg_by_dump_name correctly handles hard regs.
18676 The presence of hard reg names in the dump means that the test is
18677 target-specific, hence it is in this file. */
18678
18679 static void
18680 aarch64_test_loading_full_dump ()
18681 {
18682 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18683
18684 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18685
18686 rtx_insn *insn_1 = get_insn_by_uid (1);
18687 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18688
18689 rtx_insn *insn_15 = get_insn_by_uid (15);
18690 ASSERT_EQ (INSN, GET_CODE (insn_15));
18691 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18692
18693 /* Verify crtl->return_rtx. */
18694 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18695 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18696 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18697 }
18698
18699 /* Run all target-specific selftests. */
18700
18701 static void
18702 aarch64_run_selftests (void)
18703 {
18704 aarch64_test_loading_full_dump ();
18705 }
18706
18707 } // namespace selftest
18708
18709 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
18710 global variable based guard use the default else
18711 return a null tree. */
18712 static tree
18713 aarch64_stack_protect_guard (void)
18714 {
18715 if (aarch64_stack_protector_guard == SSP_GLOBAL)
18716 return default_stack_protect_guard ();
18717
18718 return NULL_TREE;
18719 }
18720
18721
18722 #endif /* #if CHECKING_P */
18723
18724 #undef TARGET_STACK_PROTECT_GUARD
18725 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18726
18727 #undef TARGET_ADDRESS_COST
18728 #define TARGET_ADDRESS_COST aarch64_address_cost
18729
18730 /* This hook determines whether unnamed bitfields affect the alignment
18731 of the containing structure. The hook returns true if the structure
18732 should inherit the alignment requirements of an unnamed bitfield's
18733 type. */
18734 #undef TARGET_ALIGN_ANON_BITFIELD
18735 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18736
18737 #undef TARGET_ASM_ALIGNED_DI_OP
18738 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18739
18740 #undef TARGET_ASM_ALIGNED_HI_OP
18741 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18742
18743 #undef TARGET_ASM_ALIGNED_SI_OP
18744 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18745
18746 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18747 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18748 hook_bool_const_tree_hwi_hwi_const_tree_true
18749
18750 #undef TARGET_ASM_FILE_START
18751 #define TARGET_ASM_FILE_START aarch64_start_file
18752
18753 #undef TARGET_ASM_OUTPUT_MI_THUNK
18754 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18755
18756 #undef TARGET_ASM_SELECT_RTX_SECTION
18757 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18758
18759 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18760 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18761
18762 #undef TARGET_BUILD_BUILTIN_VA_LIST
18763 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18764
18765 #undef TARGET_CALLEE_COPIES
18766 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18767
18768 #undef TARGET_CAN_ELIMINATE
18769 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18770
18771 #undef TARGET_CAN_INLINE_P
18772 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
18773
18774 #undef TARGET_CANNOT_FORCE_CONST_MEM
18775 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
18776
18777 #undef TARGET_CASE_VALUES_THRESHOLD
18778 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
18779
18780 #undef TARGET_CONDITIONAL_REGISTER_USAGE
18781 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
18782
18783 /* Only the least significant bit is used for initialization guard
18784 variables. */
18785 #undef TARGET_CXX_GUARD_MASK_BIT
18786 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
18787
18788 #undef TARGET_C_MODE_FOR_SUFFIX
18789 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
18790
18791 #ifdef TARGET_BIG_ENDIAN_DEFAULT
18792 #undef TARGET_DEFAULT_TARGET_FLAGS
18793 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
18794 #endif
18795
18796 #undef TARGET_CLASS_MAX_NREGS
18797 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
18798
18799 #undef TARGET_BUILTIN_DECL
18800 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
18801
18802 #undef TARGET_BUILTIN_RECIPROCAL
18803 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
18804
18805 #undef TARGET_C_EXCESS_PRECISION
18806 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
18807
18808 #undef TARGET_EXPAND_BUILTIN
18809 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
18810
18811 #undef TARGET_EXPAND_BUILTIN_VA_START
18812 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
18813
18814 #undef TARGET_FOLD_BUILTIN
18815 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
18816
18817 #undef TARGET_FUNCTION_ARG
18818 #define TARGET_FUNCTION_ARG aarch64_function_arg
18819
18820 #undef TARGET_FUNCTION_ARG_ADVANCE
18821 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
18822
18823 #undef TARGET_FUNCTION_ARG_BOUNDARY
18824 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
18825
18826 #undef TARGET_FUNCTION_ARG_PADDING
18827 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
18828
18829 #undef TARGET_GET_RAW_RESULT_MODE
18830 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
18831 #undef TARGET_GET_RAW_ARG_MODE
18832 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
18833
18834 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
18835 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
18836
18837 #undef TARGET_FUNCTION_VALUE
18838 #define TARGET_FUNCTION_VALUE aarch64_function_value
18839
18840 #undef TARGET_FUNCTION_VALUE_REGNO_P
18841 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
18842
18843 #undef TARGET_GIMPLE_FOLD_BUILTIN
18844 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
18845
18846 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
18847 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
18848
18849 #undef TARGET_INIT_BUILTINS
18850 #define TARGET_INIT_BUILTINS aarch64_init_builtins
18851
18852 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
18853 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
18854 aarch64_ira_change_pseudo_allocno_class
18855
18856 #undef TARGET_LEGITIMATE_ADDRESS_P
18857 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
18858
18859 #undef TARGET_LEGITIMATE_CONSTANT_P
18860 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
18861
18862 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
18863 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
18864 aarch64_legitimize_address_displacement
18865
18866 #undef TARGET_LIBGCC_CMP_RETURN_MODE
18867 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
18868
18869 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
18870 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
18871 aarch64_libgcc_floating_mode_supported_p
18872
18873 #undef TARGET_MANGLE_TYPE
18874 #define TARGET_MANGLE_TYPE aarch64_mangle_type
18875
18876 #undef TARGET_MEMORY_MOVE_COST
18877 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
18878
18879 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
18880 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
18881
18882 #undef TARGET_MUST_PASS_IN_STACK
18883 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
18884
18885 /* This target hook should return true if accesses to volatile bitfields
18886 should use the narrowest mode possible. It should return false if these
18887 accesses should use the bitfield container type. */
18888 #undef TARGET_NARROW_VOLATILE_BITFIELD
18889 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
18890
18891 #undef TARGET_OPTION_OVERRIDE
18892 #define TARGET_OPTION_OVERRIDE aarch64_override_options
18893
18894 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
18895 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
18896 aarch64_override_options_after_change
18897
18898 #undef TARGET_OPTION_SAVE
18899 #define TARGET_OPTION_SAVE aarch64_option_save
18900
18901 #undef TARGET_OPTION_RESTORE
18902 #define TARGET_OPTION_RESTORE aarch64_option_restore
18903
18904 #undef TARGET_OPTION_PRINT
18905 #define TARGET_OPTION_PRINT aarch64_option_print
18906
18907 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
18908 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
18909
18910 #undef TARGET_SET_CURRENT_FUNCTION
18911 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
18912
18913 #undef TARGET_PASS_BY_REFERENCE
18914 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
18915
18916 #undef TARGET_PREFERRED_RELOAD_CLASS
18917 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
18918
18919 #undef TARGET_SCHED_REASSOCIATION_WIDTH
18920 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
18921
18922 #undef TARGET_PROMOTED_TYPE
18923 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
18924
18925 #undef TARGET_SECONDARY_RELOAD
18926 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
18927
18928 #undef TARGET_SHIFT_TRUNCATION_MASK
18929 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
18930
18931 #undef TARGET_SETUP_INCOMING_VARARGS
18932 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
18933
18934 #undef TARGET_STRUCT_VALUE_RTX
18935 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
18936
18937 #undef TARGET_REGISTER_MOVE_COST
18938 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
18939
18940 #undef TARGET_RETURN_IN_MEMORY
18941 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
18942
18943 #undef TARGET_RETURN_IN_MSB
18944 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
18945
18946 #undef TARGET_RTX_COSTS
18947 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
18948
18949 #undef TARGET_SCALAR_MODE_SUPPORTED_P
18950 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
18951
18952 #undef TARGET_SCHED_ISSUE_RATE
18953 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
18954
18955 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
18956 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
18957 aarch64_sched_first_cycle_multipass_dfa_lookahead
18958
18959 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
18960 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
18961 aarch64_first_cycle_multipass_dfa_lookahead_guard
18962
18963 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
18964 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
18965 aarch64_get_separate_components
18966
18967 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
18968 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
18969 aarch64_components_for_bb
18970
18971 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
18972 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
18973 aarch64_disqualify_components
18974
18975 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
18976 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
18977 aarch64_emit_prologue_components
18978
18979 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
18980 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
18981 aarch64_emit_epilogue_components
18982
18983 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
18984 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
18985 aarch64_set_handled_components
18986
18987 #undef TARGET_TRAMPOLINE_INIT
18988 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
18989
18990 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
18991 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
18992
18993 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18994 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18995
18996 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18997 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18998 aarch64_builtin_support_vector_misalignment
18999
19000 #undef TARGET_ARRAY_MODE
19001 #define TARGET_ARRAY_MODE aarch64_array_mode
19002
19003 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19004 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19005
19006 #undef TARGET_VECTORIZE_ADD_STMT_COST
19007 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19008
19009 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19010 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19011 aarch64_builtin_vectorization_cost
19012
19013 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19014 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19015
19016 #undef TARGET_VECTORIZE_BUILTINS
19017 #define TARGET_VECTORIZE_BUILTINS
19018
19019 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19020 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19021 aarch64_builtin_vectorized_function
19022
19023 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19024 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19025 aarch64_autovectorize_vector_sizes
19026
19027 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19028 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19029 aarch64_atomic_assign_expand_fenv
19030
19031 /* Section anchor support. */
19032
19033 #undef TARGET_MIN_ANCHOR_OFFSET
19034 #define TARGET_MIN_ANCHOR_OFFSET -256
19035
19036 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19037 byte offset; we can do much more for larger data types, but have no way
19038 to determine the size of the access. We assume accesses are aligned. */
19039 #undef TARGET_MAX_ANCHOR_OFFSET
19040 #define TARGET_MAX_ANCHOR_OFFSET 4095
19041
19042 #undef TARGET_VECTOR_ALIGNMENT
19043 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19044
19045 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19046 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19047 aarch64_vectorize_preferred_vector_alignment
19048 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19049 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19050 aarch64_simd_vector_alignment_reachable
19051
19052 /* vec_perm support. */
19053
19054 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19055 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19056 aarch64_vectorize_vec_perm_const
19057
19058 #undef TARGET_VECTORIZE_GET_MASK_MODE
19059 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19060 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19061 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19062 aarch64_empty_mask_is_expensive
19063 #undef TARGET_PREFERRED_ELSE_VALUE
19064 #define TARGET_PREFERRED_ELSE_VALUE \
19065 aarch64_preferred_else_value
19066
19067 #undef TARGET_INIT_LIBFUNCS
19068 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19069
19070 #undef TARGET_FIXED_CONDITION_CODE_REGS
19071 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19072
19073 #undef TARGET_FLAGS_REGNUM
19074 #define TARGET_FLAGS_REGNUM CC_REGNUM
19075
19076 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19077 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19078
19079 #undef TARGET_ASAN_SHADOW_OFFSET
19080 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19081
19082 #undef TARGET_LEGITIMIZE_ADDRESS
19083 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19084
19085 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19086 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19087
19088 #undef TARGET_CAN_USE_DOLOOP_P
19089 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19090
19091 #undef TARGET_SCHED_ADJUST_PRIORITY
19092 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19093
19094 #undef TARGET_SCHED_MACRO_FUSION_P
19095 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19096
19097 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19098 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19099
19100 #undef TARGET_SCHED_FUSION_PRIORITY
19101 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19102
19103 #undef TARGET_UNSPEC_MAY_TRAP_P
19104 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19105
19106 #undef TARGET_USE_PSEUDO_PIC_REG
19107 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19108
19109 #undef TARGET_PRINT_OPERAND
19110 #define TARGET_PRINT_OPERAND aarch64_print_operand
19111
19112 #undef TARGET_PRINT_OPERAND_ADDRESS
19113 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19114
19115 #undef TARGET_OPTAB_SUPPORTED_P
19116 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19117
19118 #undef TARGET_OMIT_STRUCT_RETURN_REG
19119 #define TARGET_OMIT_STRUCT_RETURN_REG true
19120
19121 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19122 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19123 aarch64_dwarf_poly_indeterminate_value
19124
19125 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19126 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19127 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19128
19129 #undef TARGET_HARD_REGNO_NREGS
19130 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19131 #undef TARGET_HARD_REGNO_MODE_OK
19132 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19133
19134 #undef TARGET_MODES_TIEABLE_P
19135 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19136
19137 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19138 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19139 aarch64_hard_regno_call_part_clobbered
19140
19141 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19142 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19143 aarch64_remove_extra_call_preserved_regs
19144
19145 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19146 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19147 aarch64_return_call_with_max_clobbers
19148
19149 #undef TARGET_CONSTANT_ALIGNMENT
19150 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19151
19152 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19153 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19154 aarch64_stack_clash_protection_alloca_probe_range
19155
19156 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19157 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19158
19159 #undef TARGET_CAN_CHANGE_MODE_CLASS
19160 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19161
19162 #undef TARGET_SELECT_EARLY_REMAT_MODES
19163 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19164
19165 #undef TARGET_SPECULATION_SAFE_VALUE
19166 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19167
19168 #undef TARGET_ESTIMATED_POLY_VALUE
19169 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19170
19171 #undef TARGET_ATTRIBUTE_TABLE
19172 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19173
19174 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19175 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19176 aarch64_simd_clone_compute_vecsize_and_simdlen
19177
19178 #undef TARGET_SIMD_CLONE_ADJUST
19179 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19180
19181 #undef TARGET_SIMD_CLONE_USABLE
19182 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19183
19184 #if CHECKING_P
19185 #undef TARGET_RUN_TARGET_SELFTESTS
19186 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19187 #endif /* #if CHECKING_P */
19188
19189 struct gcc_target targetm = TARGET_INITIALIZER;
19190
19191 #include "gt-aarch64.h"