1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
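/* Editor's sketch (not part of the original source): a typical use of the
   integer constructor above, with an arbitrary example value.  Writing

     simd_immediate_info info (QImode, 0x3f);

   relies on the default arguments and is equivalent to the fully explicit

     simd_immediate_info info (QImode, 0x3f, simd_immediate_info::MOV,
                               simd_immediate_info::LSL, 0);

   i.e. "move the value 0x3f into every QImode element, with no shift".  */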
175
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
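/* Editor's note (illustrative, not from the original source): the two tables
   above are built with the X-macro idiom.  Under the #define above, a
   hypothetical entry in aarch64-fusion-pairs.def along the lines of

     AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK)

   expands to the initializer

     { "mov+movk", AARCH64_FUSE_MOV_MOVK },

   so the .def file remains the single source of truth for the names accepted
   by the "fuse" tuning override parsed later in this file.  */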
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Costs for vector insn classes for Cortex-A57. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* Costs for vector insn classes for X-Gene 1. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED, /* sve_width */
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
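/* Editor's note (illustrative): the enum above deliberately lists each
   condition code next to its inverse (EQ/NE, CS/CC, MI/PL, ...), so flipping
   the low bit of the encoding yields the inverse condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_HI) == AARCH64_LS  */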
1241
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1253 caller should print an error.
1254 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints
1255 its own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
1336
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value VALUE. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
1389
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
1484
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1486 static bool
1487 aarch64_advsimd_struct_mode_p (machine_mode mode)
1488 {
1489 return (TARGET_SIMD
1490 && (mode == OImode || mode == CImode || mode == XImode));
1491 }
1492
1493 /* Return true if MODE is an SVE predicate mode. */
1494 static bool
1495 aarch64_sve_pred_mode_p (machine_mode mode)
1496 {
1497 return (TARGET_SVE
1498 && (mode == VNx16BImode
1499 || mode == VNx8BImode
1500 || mode == VNx4BImode
1501 || mode == VNx2BImode));
1502 }
1503
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD = 1;
1506 const unsigned int VEC_SVE_DATA = 2;
1507 const unsigned int VEC_SVE_PRED = 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT = 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1513 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1514
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1517 static unsigned int
1518 aarch64_classify_vector_mode (machine_mode mode)
1519 {
1520 if (aarch64_advsimd_struct_mode_p (mode))
1521 return VEC_ADVSIMD | VEC_STRUCT;
1522
1523 if (aarch64_sve_pred_mode_p (mode))
1524 return VEC_SVE_PRED;
1525
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1529 switch (mode)
1530 {
1531 /* Single SVE vectors. */
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8HFmode:
1537 case E_VNx4SFmode:
1538 case E_VNx2DFmode:
1539 return TARGET_SVE ? VEC_SVE_DATA : 0;
1540
1541 /* x2 SVE vectors. */
1542 case E_VNx32QImode:
1543 case E_VNx16HImode:
1544 case E_VNx8SImode:
1545 case E_VNx4DImode:
1546 case E_VNx16HFmode:
1547 case E_VNx8SFmode:
1548 case E_VNx4DFmode:
1549 /* x3 SVE vectors. */
1550 case E_VNx48QImode:
1551 case E_VNx24HImode:
1552 case E_VNx12SImode:
1553 case E_VNx6DImode:
1554 case E_VNx24HFmode:
1555 case E_VNx12SFmode:
1556 case E_VNx6DFmode:
1557 /* x4 SVE vectors. */
1558 case E_VNx64QImode:
1559 case E_VNx32HImode:
1560 case E_VNx16SImode:
1561 case E_VNx8DImode:
1562 case E_VNx32HFmode:
1563 case E_VNx16SFmode:
1564 case E_VNx8DFmode:
1565 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1566
1567 /* 64-bit Advanced SIMD vectors. */
1568 case E_V8QImode:
1569 case E_V4HImode:
1570 case E_V2SImode:
1571 /* ...E_V1DImode doesn't exist. */
1572 case E_V4HFmode:
1573 case E_V2SFmode:
1574 case E_V1DFmode:
1575 /* 128-bit Advanced SIMD vectors. */
1576 case E_V16QImode:
1577 case E_V8HImode:
1578 case E_V4SImode:
1579 case E_V2DImode:
1580 case E_V8HFmode:
1581 case E_V4SFmode:
1582 case E_V2DFmode:
1583 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1584
1585 default:
1586 return 0;
1587 }
1588 }
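/* Editor's sketch (illustrative; assumes the relevant target features are
   enabled):

     aarch64_classify_vector_mode (E_V4SImode)   -> VEC_ADVSIMD
     aarch64_classify_vector_mode (E_VNx4SImode) -> VEC_SVE_DATA
     aarch64_classify_vector_mode (E_VNx8SImode) -> VEC_SVE_DATA | VEC_STRUCT
     aarch64_classify_vector_mode (E_VNx4BImode) -> VEC_SVE_PRED
     aarch64_classify_vector_mode (E_DImode)     -> 0 (not a vector mode)  */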
1589
1590 /* Return true if MODE is any of the data vector modes, including
1591 structure modes. */
1592 static bool
1593 aarch64_vector_data_mode_p (machine_mode mode)
1594 {
1595 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1596 }
1597
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1600 static bool
1601 aarch64_sve_data_mode_p (machine_mode mode)
1602 {
1603 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1604 }
1605
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1609 {
1610 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1611 && IN_RANGE (nelems, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode),
1613 GET_MODE_NUNITS (mode) * nelems);
1614
1615 return opt_machine_mode ();
1616 }
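/* Editor's sketch (illustrative): with SVE enabled, asking for an array of
   three VNx4SImode vectors yields the x3 structure mode:

     aarch64_array_mode (VNx4SImode, 3)  ->  VNx12SImode

   Requests outside the single-vector SVE modes, or with NELEMS outside
   [2, 4], return an empty opt_machine_mode and fall back to the generic
   handling.  */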
1617
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1619 static bool
1620 aarch64_array_mode_supported_p (machine_mode mode,
1621 unsigned HOST_WIDE_INT nelems)
1622 {
1623 if (TARGET_SIMD
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1626 && (nelems >= 2 && nelems <= 4))
1627 return true;
1628
1629 return false;
1630 }
1631
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
1634
1635 opt_machine_mode
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1637 {
1638 if (TARGET_SVE)
1639 {
1640 if (elem_nbytes == 1)
1641 return VNx16BImode;
1642 if (elem_nbytes == 2)
1643 return VNx8BImode;
1644 if (elem_nbytes == 4)
1645 return VNx4BImode;
1646 if (elem_nbytes == 8)
1647 return VNx2BImode;
1648 }
1649 return opt_machine_mode ();
1650 }
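/* Editor's sketch (illustrative): callers are expected to test the returned
   opt_machine_mode before using it, as aarch64_get_mask_mode does below:

     machine_mode pred_mode;
     if (aarch64_sve_pred_mode (4).exists (&pred_mode))
       ... pred_mode is VNx4BImode when SVE is enabled ...  */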
1651
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1653
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1656 {
1657 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1658 {
1659 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1660 machine_mode pred_mode;
1661 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1662 return pred_mode;
1663 }
1664
1665 return default_get_mask_mode (nunits, nbytes);
1666 }
1667
1668 /* Return the integer element mode associated with SVE mode MODE. */
1669
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode)
1672 {
1673 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1674 GET_MODE_NUNITS (mode));
1675 return int_mode_for_size (elt_bits, 0).require ();
1676 }
1677
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1684 natural. */
1685
1686 static tree
1687 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1688 {
1689 return nops == 3 ? ops[2] : ops[0];
1690 }
1691
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1693
1694 static unsigned int
1695 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1696 {
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1702 switch (aarch64_regno_regclass (regno))
1703 {
1704 case FP_REGS:
1705 case FP_LO_REGS:
1706 case FP_LO8_REGS:
1707 if (aarch64_sve_data_mode_p (mode))
1708 return exact_div (GET_MODE_SIZE (mode),
1709 BYTES_PER_SVE_VECTOR).to_constant ();
1710 return CEIL (lowest_size, UNITS_PER_VREG);
1711 case PR_REGS:
1712 case PR_LO_REGS:
1713 case PR_HI_REGS:
1714 return 1;
1715 default:
1716 return CEIL (lowest_size, UNITS_PER_WORD);
1717 }
1718 gcc_unreachable ();
1719 }
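/* Editor's worked examples for the function above (illustrative; they assume
   64-bit GP registers and the 128-bit minimum Advanced SIMD/SVE vector
   length):

     - TImode (16 bytes) in GP registers:  CEIL (16, UNITS_PER_WORD) = 2.
     - V4SImode (16 bytes) in an FP register:  CEIL (16, UNITS_PER_VREG) = 1.
     - VNx8SImode (an SVE x2 structure) in FP registers:
         GET_MODE_SIZE / BYTES_PER_SVE_VECTOR = 2.
     - Any predicate mode in a predicate register:  1.  */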
1720
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1722
1723 static bool
1724 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1725 {
1726 if (GET_MODE_CLASS (mode) == MODE_CC)
1727 return regno == CC_REGNUM;
1728
1729 if (regno == VG_REGNUM)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode == DImode;
1732
1733 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1734 if (vec_flags & VEC_SVE_PRED)
1735 return PR_REGNUM_P (regno);
1736
1737 if (PR_REGNUM_P (regno))
1738 return 0;
1739
1740 if (regno == SP_REGNUM)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode == Pmode || mode == ptr_mode;
1745
1746 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1747 return mode == Pmode;
1748
1749 if (GP_REGNUM_P (regno))
1750 {
1751 if (known_le (GET_MODE_SIZE (mode), 8))
1752 return true;
1753 else if (known_le (GET_MODE_SIZE (mode), 16))
1754 return (regno & 1) == 0;
1755 }
1756 else if (FP_REGNUM_P (regno))
1757 {
1758 if (vec_flags & VEC_STRUCT)
1759 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1760 else
1761 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1762 }
1763
1764 return false;
1765 }
1766
1767 /* Return true if this is a definition of a vectorized simd function. */
1768
1769 static bool
1770 aarch64_simd_decl_p (tree fndecl)
1771 {
1772 tree fntype;
1773
1774 if (fndecl == NULL)
1775 return false;
1776 fntype = TREE_TYPE (fndecl);
1777 if (fntype == NULL)
1778 return false;
1779
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1782 return true;
1783
1784 return false;
1785 }
1786
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1790 SIMD functions. */
1791
1792 static machine_mode
1793 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1794 {
1795 return GP_REGNUM_P (regno)
1796 ? E_DImode
1797 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1798 }
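/* Editor's sketch (illustrative): a GP register save always uses DImode,
   while an FP register save uses DFmode under the base PCS (only the low
   64 bits are call-preserved) and TFmode when FNDECL carries the
   "aarch64_vector_pcs" attribute, per aarch64_simd_decl_p above.  */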
1799
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1802 the function. */
1803
1804 static bool
1805 aarch64_simd_call_p (rtx_insn *insn)
1806 {
1807 rtx symbol;
1808 rtx call;
1809 tree fndecl;
1810
1811 gcc_assert (CALL_P (insn));
1812 call = get_call_rtx_from (insn);
1813 symbol = XEXP (XEXP (call, 0), 0);
1814 if (GET_CODE (symbol) != SYMBOL_REF)
1815 return false;
1816 fndecl = SYMBOL_REF_DECL (symbol);
1817 if (!fndecl)
1818 return false;
1819
1820 return aarch64_simd_decl_p (fndecl);
1821 }
1822
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1826
1827 void
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1829 HARD_REG_SET *return_set)
1830 {
1831 if (aarch64_simd_call_p (insn))
1832 {
1833 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno))
1835 CLEAR_HARD_REG_BIT (*return_set, regno);
1836 }
1837 }
1838
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
1842
1843 static bool
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1845 machine_mode mode)
1846 {
1847 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1848 return FP_REGNUM_P (regno)
1849 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1850 }
1851
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1853
1854 rtx_insn *
1855 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1856 {
1857 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1858
1859 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1860 return call_1;
1861 else
1862 return call_2;
1863 }
1864
1865 /* Implement REGMODE_NATURAL_SIZE. */
1866 poly_uint64
1867 aarch64_regmode_natural_size (machine_mode mode)
1868 {
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg.is_constant ())
1877 {
1878 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1879 if (vec_flags & VEC_SVE_PRED)
1880 return BYTES_PER_SVE_PRED;
1881 if (vec_flags & VEC_SVE_DATA)
1882 return BYTES_PER_SVE_VECTOR;
1883 }
1884 return UNITS_PER_WORD;
1885 }
1886
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1888 machine_mode
1889 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1890 machine_mode mode)
1891 {
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno))
1897 return mode;
1898 if (known_ge (GET_MODE_SIZE (mode), 4))
1899 return mode;
1900 else
1901 return SImode;
1902 }
1903
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1905 bool
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1907 {
1908 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1909 }
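
/* A minimal illustrative sketch (not part of this file) of the check above,
   using plain 64-bit arithmetic: I has consecutive ones from the MSB iff -I
   is a nonzero power of two.  For example, I = 0xffffffffffffff00 gives
   -I = 0x100, so the answer is true, while I = 0xff00ff00ff00ff00 gives
   -I = 0x00ff00ff00ff0100, which is not a power of two.  The block is
   guarded out of the build and is only meant as documentation.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
high_bits_all_ones_example (int64_t i)
{
  uint64_t neg = -(uint64_t) i;
  return neg != 0 && (neg & (neg - 1)) == 0;
}
#endif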
1910
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1913
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1916 {
1917 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1918 return MAX (align, BITS_PER_WORD);
1919 return align;
1920 }
1921
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (ie called via a register). */
1924 static bool
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1926 {
1927 return false;
1928 }
1929
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (ie called via a register). */
1932 bool
1933 aarch64_is_long_call_p (rtx sym)
1934 {
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1936 }
1937
1938 /* Return true if calls to symbol-ref SYM should not go through
1939 plt stubs. */
1940
1941 bool
1942 aarch64_is_noplt_call_p (rtx sym)
1943 {
1944 const_tree decl = SYMBOL_REF_DECL (sym);
1945
1946 if (flag_pic
1947 && decl
1948 && (!flag_plt
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1950 && !targetm.binds_local_p (decl))
1951 return true;
1952
1953 return false;
1954 }
1955
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the paramters from
1959
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1961 bool
1962 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1963 rtx extract_imm)
1964 {
1965 HOST_WIDE_INT mult_val, extract_val;
1966
1967 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1968 return false;
1969
1970 mult_val = INTVAL (mult_imm);
1971 extract_val = INTVAL (extract_imm);
1972
1973 if (extract_val > 8
1974 && extract_val < GET_MODE_BITSIZE (mode)
1975 && exact_log2 (extract_val & ~7) > 0
1976 && (extract_val & 7) <= 4
1977 && mult_val == (1 << (extract_val & 7)))
1978 return true;
1979
1980 return false;
1981 }
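
/* Illustrative sketch (not part of this file) that mirrors the numeric
   checks above on plain integers.  For example, EXTRACT_IMM = 34 and
   MULT_IMM = 4 pass for a 64-bit mode: 34 & ~7 = 32 is a power of two,
   34 & 7 = 2 is at most 4, and 4 == 1 << 2.  Guarded out of the build.  */
#if 0
#include <stdbool.h>

static bool
extend_from_extract_example (int mode_bitsize, long long mult_val,
                             long long extract_val)
{
  if (extract_val <= 8 || extract_val >= mode_bitsize)
    return false;
  long long base = extract_val & ~7LL;   /* Must be a power of two.  */
  long long shift = extract_val & 7;     /* Implied left-shift amount.  */
  return (base & (base - 1)) == 0
         && shift <= 4
         && mult_val == (1LL << shift);
}
#endif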
1982
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn *
1986 emit_set_insn (rtx x, rtx y)
1987 {
1988 return emit_insn (gen_rtx_SET (x, y));
1989 }
1990
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1993 rtx
1994 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1995 {
1996 machine_mode mode = SELECT_CC_MODE (code, x, y);
1997 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1998
1999 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2000 return cc_reg;
2001 }
2002
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2004
2005 static rtx
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2007 machine_mode y_mode)
2008 {
2009 if (y_mode == E_QImode || y_mode == E_HImode)
2010 {
2011 if (CONST_INT_P (y))
2012 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2013 else
2014 {
2015 rtx t, cc_reg;
2016 machine_mode cc_mode;
2017
2018 t = gen_rtx_ZERO_EXTEND (SImode, y);
2019 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2020 cc_mode = CC_SWPmode;
2021 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2022 emit_set_insn (cc_reg, t);
2023 return cc_reg;
2024 }
2025 }
2026
2027 return aarch64_gen_compare_reg (code, x, y);
2028 }
2029
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
2031
2032 static GTY(()) rtx tls_get_addr_libfunc;
2033
2034 rtx
2035 aarch64_tls_get_addr (void)
2036 {
2037 if (!tls_get_addr_libfunc)
2038 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc;
2040 }
2041
2042 /* Return the TLS model to use for ADDR. */
2043
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr)
2046 {
2047 enum tls_model tls_kind = TLS_MODEL_NONE;
2048 if (GET_CODE (addr) == CONST)
2049 {
2050 poly_int64 addend;
2051 rtx sym = strip_offset (addr, &addend);
2052 if (GET_CODE (sym) == SYMBOL_REF)
2053 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2054 }
2055 else if (GET_CODE (addr) == SYMBOL_REF)
2056 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2057
2058 return tls_kind;
2059 }
2060
2061 /* We allow lo_sum's in addresses as legitimate addresses, so that
2062 combine can take care of combining addresses where necessary, but
2063 for generation purposes we generate the address directly, as
2064 follows:
2065 RTL Absolute
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2068 nop
2069
2070 PIC TLS
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2073 bl __tls_get_addr
2074 nop
2075
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2077
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2081 bl __tls_get_addr
2082
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2087 blr tmp
2088 mrs tp, tpidr_el0
2089 add dest, dest, tp
2090
2091 Initial Exec:
2092 mrs tp, tpidr_el0
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2095 add dest, dest, tp
2096
2097 Local Exec:
2098 mrs tp, tpidr_el0
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
2101 */
2102
2103 static void
2104 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2105 enum aarch64_symbol_type type)
2106 {
2107 switch (type)
2108 {
2109 case SYMBOL_SMALL_ABSOLUTE:
2110 {
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2112 rtx tmp_reg = dest;
2113 machine_mode mode = GET_MODE (dest);
2114
2115 gcc_assert (mode == Pmode || mode == ptr_mode);
2116
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2119
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2122 return;
2123 }
2124
2125 case SYMBOL_TINY_ABSOLUTE:
2126 emit_insn (gen_rtx_SET (dest, imm));
2127 return;
2128
2129 case SYMBOL_SMALL_GOT_28K:
2130 {
2131 machine_mode mode = GET_MODE (dest);
2132 rtx gp_rtx = pic_offset_table_rtx;
2133 rtx insn;
2134 rtx mem;
2135
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before RTL expansion. Tree IVOPTS generates RTL patterns in
2138 order to estimate rtx costs, in which case pic_offset_table_rtx is
2139 not initialized. In that case there is no need to generate the
2140 first adrp instruction, since the final cost of a global variable
2141 access is one instruction. */
2142 if (gp_rtx != NULL)
2143 {
2144 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2145 use the page base as the GOT base, the first page may be wasted;
2146 in the worst case only 28K of space is left for the GOT).
2147
2148 The generated instruction sequence for accessing a global variable
2149 is:
2150
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2152
2153 Only one instruction is needed, but we must initialize
2154 pic_offset_table_rtx properly. We generate an initialization insn
2155 for every global access and rely on CSE to remove the redundant ones.
2156
2157 The final instruction sequence for accesses to multiple global
2158 variables will look like the following.
2159
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2161
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2165 ... */
2166
2167 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2168 crtl->uses_pic_offset_table = 1;
2169 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2170
2171 if (mode != GET_MODE (gp_rtx))
2172 gp_rtx = gen_lowpart (mode, gp_rtx);
2173
2174 }
2175
2176 if (mode == ptr_mode)
2177 {
2178 if (mode == DImode)
2179 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2180 else
2181 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2182
2183 mem = XVECEXP (SET_SRC (insn), 0, 0);
2184 }
2185 else
2186 {
2187 gcc_assert (mode == Pmode);
2188
2189 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2190 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2191 }
2192
2193 /* The operand is expected to be a MEM. Whenever the related insn
2194 pattern changes, the code above that calculates MEM should be
2195 updated accordingly. */
2196 gcc_assert (GET_CODE (mem) == MEM);
2197 MEM_READONLY_P (mem) = 1;
2198 MEM_NOTRAP_P (mem) = 1;
2199 emit_insn (insn);
2200 return;
2201 }
2202
2203 case SYMBOL_SMALL_GOT_4G:
2204 {
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2212
2213 rtx insn;
2214 rtx mem;
2215 rtx tmp_reg = dest;
2216 machine_mode mode = GET_MODE (dest);
2217
2218 if (can_create_pseudo_p ())
2219 tmp_reg = gen_reg_rtx (mode);
2220
2221 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2222 if (mode == ptr_mode)
2223 {
2224 if (mode == DImode)
2225 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2226 else
2227 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2228
2229 mem = XVECEXP (SET_SRC (insn), 0, 0);
2230 }
2231 else
2232 {
2233 gcc_assert (mode == Pmode);
2234
2235 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2236 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2237 }
2238
2239 gcc_assert (GET_CODE (mem) == MEM);
2240 MEM_READONLY_P (mem) = 1;
2241 MEM_NOTRAP_P (mem) = 1;
2242 emit_insn (insn);
2243 return;
2244 }
2245
2246 case SYMBOL_SMALL_TLSGD:
2247 {
2248 rtx_insn *insns;
2249 machine_mode mode = GET_MODE (dest);
2250 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2251
2252 start_sequence ();
2253 if (TARGET_ILP32)
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2255 else
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2257 insns = get_insns ();
2258 end_sequence ();
2259
2260 RTL_CONST_CALL_P (insns) = 1;
2261 emit_libcall_block (insns, dest, result, imm);
2262 return;
2263 }
2264
2265 case SYMBOL_SMALL_TLSDESC:
2266 {
2267 machine_mode mode = GET_MODE (dest);
2268 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2269 rtx tp;
2270
2271 gcc_assert (mode == Pmode || mode == ptr_mode);
2272
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2275 if (TARGET_ILP32)
2276 emit_insn (gen_tlsdesc_small_si (imm));
2277 else
2278 emit_insn (gen_tlsdesc_small_di (imm));
2279 tp = aarch64_load_tp (NULL);
2280
2281 if (mode != Pmode)
2282 tp = gen_lowpart (mode, tp);
2283
2284 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2285 if (REG_P (dest))
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2287 return;
2288 }
2289
2290 case SYMBOL_SMALL_TLSIE:
2291 {
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode = GET_MODE (dest);
2300 rtx tmp_reg = gen_reg_rtx (mode);
2301 rtx tp = aarch64_load_tp (NULL);
2302
2303 if (mode == ptr_mode)
2304 {
2305 if (mode == DImode)
2306 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2307 else
2308 {
2309 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2310 tp = gen_lowpart (mode, tp);
2311 }
2312 }
2313 else
2314 {
2315 gcc_assert (mode == Pmode);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2317 }
2318
2319 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2320 if (REG_P (dest))
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2322 return;
2323 }
2324
2325 case SYMBOL_TLSLE12:
2326 case SYMBOL_TLSLE24:
2327 case SYMBOL_TLSLE32:
2328 case SYMBOL_TLSLE48:
2329 {
2330 machine_mode mode = GET_MODE (dest);
2331 rtx tp = aarch64_load_tp (NULL);
2332
2333 if (mode != Pmode)
2334 tp = gen_lowpart (mode, tp);
2335
2336 switch (type)
2337 {
2338 case SYMBOL_TLSLE12:
2339 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2340 (dest, tp, imm));
2341 break;
2342 case SYMBOL_TLSLE24:
2343 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2344 (dest, tp, imm));
2345 break;
2346 case SYMBOL_TLSLE32:
2347 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2348 (dest, imm));
2349 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2350 (dest, dest, tp));
2351 break;
2352 case SYMBOL_TLSLE48:
2353 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2354 (dest, imm));
2355 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2356 (dest, dest, tp));
2357 break;
2358 default:
2359 gcc_unreachable ();
2360 }
2361
2362 if (REG_P (dest))
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2364 return;
2365 }
2366
2367 case SYMBOL_TINY_GOT:
2368 emit_insn (gen_ldr_got_tiny (dest, imm));
2369 return;
2370
2371 case SYMBOL_TINY_TLSIE:
2372 {
2373 machine_mode mode = GET_MODE (dest);
2374 rtx tp = aarch64_load_tp (NULL);
2375
2376 if (mode == ptr_mode)
2377 {
2378 if (mode == DImode)
2379 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2380 else
2381 {
2382 tp = gen_lowpart (mode, tp);
2383 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2384 }
2385 }
2386 else
2387 {
2388 gcc_assert (mode == Pmode);
2389 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2390 }
2391
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2395 }
2396
2397 default:
2398 gcc_unreachable ();
2399 }
2400 }
2401
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2407 static rtx
2408 aarch64_emit_move (rtx dest, rtx src)
2409 {
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest, src)
2412 : emit_move_insn_1 (dest, src));
2413 }
2414
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
2416
2417 static void
2418 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2419 {
2420 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2421 if (dest != tmp)
2422 emit_move_insn (dest, tmp);
2423 }
2424
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2426
2427 static void
2428 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2429 {
2430 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2431 OPTAB_DIRECT);
2432 if (dest != tmp)
2433 emit_move_insn (dest, tmp);
2434 }
2435
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2441 effects. */
2442 void
2443 aarch64_split_128bit_move (rtx dst, rtx src)
2444 {
2445 rtx dst_lo, dst_hi;
2446 rtx src_lo, src_hi;
2447
2448 machine_mode mode = GET_MODE (dst);
2449
2450 gcc_assert (mode == TImode || mode == TFmode);
2451 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2452 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2453
2454 if (REG_P (dst) && REG_P (src))
2455 {
2456 int src_regno = REGNO (src);
2457 int dst_regno = REGNO (dst);
2458
2459 /* Handle FP <-> GP regs. */
2460 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2461 {
2462 src_lo = gen_lowpart (word_mode, src);
2463 src_hi = gen_highpart (word_mode, src);
2464
2465 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2466 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2467 return;
2468 }
2469 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2470 {
2471 dst_lo = gen_lowpart (word_mode, dst);
2472 dst_hi = gen_highpart (word_mode, dst);
2473
2474 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2475 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2476 return;
2477 }
2478 }
2479
2480 dst_lo = gen_lowpart (word_mode, dst);
2481 dst_hi = gen_highpart (word_mode, dst);
2482 src_lo = gen_lowpart (word_mode, src);
2483 src_hi = gen_highpart_mode (word_mode, mode, src);
2484
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2487 {
2488 aarch64_emit_move (dst_hi, src_hi);
2489 aarch64_emit_move (dst_lo, src_lo);
2490 }
2491 else
2492 {
2493 aarch64_emit_move (dst_lo, src_lo);
2494 aarch64_emit_move (dst_hi, src_hi);
2495 }
2496 }
2497
2498 bool
2499 aarch64_split_128bit_move_p (rtx dst, rtx src)
2500 {
2501 return (! REG_P (src)
2502 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2503 }
2504
2505 /* Split a complex SIMD combine. */
2506
2507 void
2508 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2509 {
2510 machine_mode src_mode = GET_MODE (src1);
2511 machine_mode dst_mode = GET_MODE (dst);
2512
2513 gcc_assert (VECTOR_MODE_P (dst_mode));
2514 gcc_assert (register_operand (dst, dst_mode)
2515 && register_operand (src1, src_mode)
2516 && register_operand (src2, src_mode));
2517
2518 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2519 return;
2520 }
2521
2522 /* Split a complex SIMD move. */
2523
2524 void
2525 aarch64_split_simd_move (rtx dst, rtx src)
2526 {
2527 machine_mode src_mode = GET_MODE (src);
2528 machine_mode dst_mode = GET_MODE (dst);
2529
2530 gcc_assert (VECTOR_MODE_P (dst_mode));
2531
2532 if (REG_P (dst) && REG_P (src))
2533 {
2534 gcc_assert (VECTOR_MODE_P (src_mode));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2536 }
2537 }
2538
2539 bool
2540 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2541 machine_mode ymode, rtx y)
2542 {
2543 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2544 gcc_assert (r != NULL);
2545 return rtx_equal_p (x, r);
2546 }
2547
2548
2549 /* Return TARGET if it is nonnull and a register of mode MODE.
2550 Otherwise, return a fresh register of mode MODE if we can,
2551 or TARGET reinterpreted as MODE if we can't. */
2552
2553 static rtx
2554 aarch64_target_reg (rtx target, machine_mode mode)
2555 {
2556 if (target && REG_P (target) && GET_MODE (target) == mode)
2557 return target;
2558 if (!can_create_pseudo_p ())
2559 {
2560 gcc_assert (target);
2561 return gen_lowpart (mode, target);
2562 }
2563 return gen_reg_rtx (mode);
2564 }
2565
2566 /* Return a register that contains the constant in BUILDER, given that
2567 the constant is a legitimate move operand. Use TARGET as the register
2568 if it is nonnull and convenient. */
2569
2570 static rtx
2571 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2572 {
2573 rtx src = builder.build ();
2574 target = aarch64_target_reg (target, GET_MODE (src));
2575 emit_insn (gen_rtx_SET (target, src));
2576 return target;
2577 }
2578
2579 static rtx
2580 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2581 {
2582 if (can_create_pseudo_p ())
2583 return force_reg (mode, value);
2584 else
2585 {
2586 gcc_assert (x);
2587 aarch64_emit_move (x, value);
2588 return x;
2589 }
2590 }
2591
2592 /* Return true if predicate value X is a constant in which every element
2593 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2594 value, i.e. as a predicate in which all bits are significant. */
2595
2596 static bool
2597 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2598 {
2599 if (GET_CODE (x) != CONST_VECTOR)
2600 return false;
2601
2602 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2603 GET_MODE_NUNITS (GET_MODE (x)));
2604 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2605 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2606 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2607
2608 unsigned int nelts = const_vector_encoded_nelts (x);
2609 for (unsigned int i = 0; i < nelts; ++i)
2610 {
2611 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2612 if (!CONST_INT_P (elt))
2613 return false;
2614
2615 builder.quick_push (elt);
2616 for (unsigned int j = 1; j < factor; ++j)
2617 builder.quick_push (const0_rtx);
2618 }
2619 builder.finalize ();
2620 return true;
2621 }
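
/* Illustrative sketch (not part of this file) of the widening step above,
   on plain arrays rather than the rtx_vector_builder encoding.  For
   example, a VNx4BI constant {1, 0, 1, 1} has FACTOR = 16/4 = 4 and is
   described as the VNx16BI value {1,0,0,0, 0,0,0,0, 1,0,0,0, 1,0,0,0}.
   Guarded out of the build.  */
#if 0
static void
widen_pred_bits_example (const unsigned char *src, unsigned int src_nelts,
                         unsigned int factor, unsigned char *dst)
{
  for (unsigned int i = 0; i < src_nelts; ++i)
    {
      dst[i * factor] = src[i];              /* The significant bit.  */
      for (unsigned int j = 1; j < factor; ++j)
        dst[i * factor + j] = 0;             /* Upper bits are zero.  */
    }
}
#endif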
2622
2623 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2624 widest predicate element size it can have (that is, the largest size
2625 for which each element would still be 0 or 1). */
2626
2627 unsigned int
2628 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2629 {
2630 /* Start with the most optimistic assumption: that we only need
2631 one bit per pattern. This is what we will use if only the first
2632 bit in each pattern is ever set. */
2633 unsigned int mask = GET_MODE_SIZE (DImode);
2634 mask |= builder.npatterns ();
2635
2636 /* Look for set bits. */
2637 unsigned int nelts = builder.encoded_nelts ();
2638 for (unsigned int i = 1; i < nelts; ++i)
2639 if (INTVAL (builder.elt (i)) != 0)
2640 {
2641 if (i & 1)
2642 return 1;
2643 mask |= i;
2644 }
2645 return mask & -mask;
2646 }
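
/* Illustrative sketch (not part of this file) of the same computation on a
   fully-expanded array of predicate bits: the answer is the largest power
   of two, capped at 8, that divides the index of every set bit.  For
   example, bits set only at indices 0, 4, 8 and 12 give 4, i.e. the
   constant can act as a predicate for 4-byte elements.  Guarded out of
   the build.  */
#if 0
static unsigned int
widest_pred_elt_size_example (const unsigned char *bits, unsigned int nbits)
{
  unsigned int mask = 8;          /* Never wider than 8-byte elements.  */
  for (unsigned int i = 1; i < nbits; ++i)
    if (bits[i])
      mask |= i;
  return mask & -mask;
}
#endif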
2647
2648 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2649 that the constant would have with predicate element size ELT_SIZE
2650 (ignoring the upper bits in each element) and return:
2651
2652 * -1 if all bits are set
2653 * N if the predicate has N leading set bits followed by all clear bits
2654 * 0 if the predicate does not have any of these forms. */
2655
2656 int
2657 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2658 unsigned int elt_size)
2659 {
2660 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2661 followed by set bits. */
2662 if (builder.nelts_per_pattern () == 3)
2663 return 0;
2664
2665 /* Skip over leading set bits. */
2666 unsigned int nelts = builder.encoded_nelts ();
2667 unsigned int i = 0;
2668 for (; i < nelts; i += elt_size)
2669 if (INTVAL (builder.elt (i)) == 0)
2670 break;
2671 unsigned int vl = i / elt_size;
2672
2673 /* Check for the all-true case. */
2674 if (i == nelts)
2675 return -1;
2676
2677 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2678 repeating pattern of set bits followed by clear bits. */
2679 if (builder.nelts_per_pattern () != 2)
2680 return 0;
2681
2682 /* We have a "foreground" value and a duplicated "background" value.
2683 If the background might repeat and the last set bit belongs to it,
2684 we might have set bits followed by clear bits followed by set bits. */
2685 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2686 return 0;
2687
2688 /* Make sure that the rest are all clear. */
2689 for (; i < nelts; i += elt_size)
2690 if (INTVAL (builder.elt (i)) != 0)
2691 return 0;
2692
2693 return vl;
2694 }
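
/* Illustrative sketch (not part of this file) of the same classification on
   a fully-expanded array of predicate bits, sampling one bit per element of
   ELT_SIZE bytes.  For example, with ELT_SIZE = 2 the bits
   {1,0, 1,0, 1,0, 0,0, ...} (all remaining bits clear) give 3, while an
   all-set predicate gives -1.  Guarded out of the build.  */
#if 0
static int
partial_ptrue_length_example (const unsigned char *bits, unsigned int nbits,
                              unsigned int elt_size)
{
  unsigned int i = 0;
  for (; i < nbits; i += elt_size)       /* Skip over leading set bits.  */
    if (!bits[i])
      break;
  int vl = (int) (i / elt_size);
  if (i >= nbits)
    return -1;                           /* All bits set.  */
  for (; i < nbits; i += elt_size)       /* The rest must be clear.  */
    if (bits[i])
      return 0;
  return vl;
}
#endif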
2695
2696 /* See if there is an svpattern that encodes an SVE predicate of mode
2697 PRED_MODE in which the first VL bits are set and the rest are clear.
2698 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2699 A VL of -1 indicates an all-true vector. */
2700
2701 aarch64_svpattern
2702 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2703 {
2704 if (vl < 0)
2705 return AARCH64_SV_ALL;
2706
2707 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2708 return AARCH64_NUM_SVPATTERNS;
2709
2710 if (vl >= 1 && vl <= 8)
2711 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2712
2713 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2714 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2715
2716 int max_vl;
2717 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2718 {
2719 if (vl == (max_vl / 3) * 3)
2720 return AARCH64_SV_MUL3;
2721 /* These would only trigger for non-power-of-2 lengths. */
2722 if (vl == (max_vl & -4))
2723 return AARCH64_SV_MUL4;
2724 if (vl == (1 << floor_log2 (max_vl)))
2725 return AARCH64_SV_POW2;
2726 if (vl == max_vl)
2727 return AARCH64_SV_ALL;
2728 }
2729 return AARCH64_NUM_SVPATTERNS;
2730 }
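
/* Rough sketch (not part of this file) of the fixed-VL cases above, mapping
   a requested length to the name of the SVE pattern operand; the MUL3,
   MUL4, POW2 and ALL cases depend on the actual number of elements and are
   left out.  Guarded out of the build.  */
#if 0
#include <stdio.h>

static const char *
svpattern_name_example (int vl)
{
  static char buf[8];
  if (vl < 0)
    return "all";                        /* An all-true predicate.  */
  if ((vl >= 1 && vl <= 8)
      || vl == 16 || vl == 32 || vl == 64 || vl == 128 || vl == 256)
    {
      snprintf (buf, sizeof buf, "vl%d", vl);
      return buf;                        /* VL1...VL8, VL16...VL256.  */
    }
  return "<no single pattern>";
}
#endif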
2731
2732 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2733 bits has the lowest bit set and the upper bits clear. This is the
2734 VNx16BImode equivalent of a PTRUE for controlling elements of
2735 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2736 all bits are significant, even the upper zeros. */
2737
2738 rtx
2739 aarch64_ptrue_all (unsigned int elt_size)
2740 {
2741 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2742 builder.quick_push (const1_rtx);
2743 for (unsigned int i = 1; i < elt_size; ++i)
2744 builder.quick_push (const0_rtx);
2745 return builder.build ();
2746 }
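
/* For example, aarch64_ptrue_all (4) describes the repeating VNx16BI bit
   pattern {1,0,0,0, 1,0,0,0, ...}, i.e. one significant bit per 4-byte
   element.  A plain-array sketch of the same pattern (not part of this
   file, guarded out of the build):  */
#if 0
static void
ptrue_all_example (unsigned char *bits, unsigned int nbits,
                   unsigned int elt_size)
{
  for (unsigned int i = 0; i < nbits; ++i)
    bits[i] = (i % elt_size == 0);
}
#endif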
2747
2748 /* Return an all-true predicate register of mode MODE. */
2749
2750 rtx
2751 aarch64_ptrue_reg (machine_mode mode)
2752 {
2753 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2754 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2755 return gen_lowpart (mode, reg);
2756 }
2757
2758 /* Return an all-false predicate register of mode MODE. */
2759
2760 rtx
2761 aarch64_pfalse_reg (machine_mode mode)
2762 {
2763 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2764 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2765 return gen_lowpart (mode, reg);
2766 }
2767
2768 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2769 true, or alternatively if we know that the operation predicated by
2770 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2771 aarch64_sve_gp_strictness operand that describes the operation
2772 predicated by PRED1[0]. */
2773
2774 bool
2775 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2776 {
2777 machine_mode mode = GET_MODE (pred2);
2778 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2779 && mode == GET_MODE (pred1[0])
2780 && aarch64_sve_gp_strictness (pred1[1], SImode));
2781 return (pred1[0] == CONSTM1_RTX (mode)
2782 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2783 || rtx_equal_p (pred1[0], pred2));
2784 }
2785
2786 /* Use a comparison to convert integer vector SRC into MODE, which is
2787 the corresponding SVE predicate mode. Use TARGET for the result
2788 if it's nonnull and convenient. */
2789
2790 static rtx
2791 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2792 {
2793 machine_mode src_mode = GET_MODE (src);
2794 insn_code icode = code_for_aarch64_pred_cmp (NE, src_mode);
2795 expand_operand ops[4];
2796 create_output_operand (&ops[0], target, mode);
2797 create_input_operand (&ops[1], CONSTM1_RTX (mode), mode);
2798 create_input_operand (&ops[2], src, src_mode);
2799 create_input_operand (&ops[3], CONST0_RTX (src_mode), src_mode);
2800 expand_insn (icode, 4, ops);
2801 return ops[0].value;
2802 }
2803
2804 /* Return true if we can move VALUE into a register using a single
2805 CNT[BHWD] instruction. */
2806
2807 static bool
2808 aarch64_sve_cnt_immediate_p (poly_int64 value)
2809 {
2810 HOST_WIDE_INT factor = value.coeffs[0];
2811 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2812 return (value.coeffs[1] == factor
2813 && IN_RANGE (factor, 2, 16 * 16)
2814 && (factor & 1) == 0
2815 && factor <= 16 * (factor & -factor));
2816 }
2817
2818 /* Likewise for rtx X. */
2819
2820 bool
2821 aarch64_sve_cnt_immediate_p (rtx x)
2822 {
2823 poly_int64 value;
2824 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2825 }
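
/* Illustrative sketch (not part of this file) of the range check above on a
   plain (COEFF0, COEFF1) pair.  For example, (48, 48) is accepted: 48 is
   even, its lowest set bit is 16 and 48 <= 16 * 16, so it can be produced
   by e.g. CNTW with MUL #12.  (6, 4) is rejected because the constant and
   runtime coefficients differ.  Guarded out of the build.  */
#if 0
#include <stdbool.h>

static bool
sve_cnt_immediate_example (long long coeff0, long long coeff1)
{
  long long factor = coeff0;
  return (coeff1 == factor
          && factor >= 2 && factor <= 16 * 16
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
}
#endif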
2826
2827 /* Return the asm string for an instruction with a CNT-like vector size
2828 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2829 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2830 first part of the operands template (the part that comes before the
2831 vector size itself). FACTOR is the number of quadwords.
2832 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2833 If it is zero, we can use any element size. */
2834
2835 static char *
2836 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2837 unsigned int factor,
2838 unsigned int nelts_per_vq)
2839 {
2840 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2841
2842 if (nelts_per_vq == 0)
2843 /* There is some overlap in the ranges of the four CNT instructions.
2844 Here we always use the smallest possible element size, so that the
2845 multiplier is 1 wherever possible. */
2846 nelts_per_vq = factor & -factor;
2847 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2848 gcc_assert (IN_RANGE (shift, 1, 4));
2849 char suffix = "dwhb"[shift - 1];
2850
2851 factor >>= shift;
2852 unsigned int written;
2853 if (factor == 1)
2854 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2855 prefix, suffix, operands);
2856 else
2857 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2858 prefix, suffix, operands, factor);
2859 gcc_assert (written < sizeof (buffer));
2860 return buffer;
2861 }
2862
2863 /* Return the asm string for an instruction with a CNT-like vector size
2864 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2865 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2866 first part of the operands template (the part that comes before the
2867 vector size itself). X is the value of the vector size operand,
2868 as a polynomial integer rtx. */
2869
2870 char *
2871 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2872 rtx x)
2873 {
2874 poly_int64 value = rtx_to_poly_int64 (x);
2875 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2876 return aarch64_output_sve_cnt_immediate (prefix, operands,
2877 value.coeffs[1], 0);
2878 }
2879
2880 /* Return true if we can add VALUE to a register using a single ADDVL
2881 or ADDPL instruction. */
2882
2883 static bool
2884 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2885 {
2886 HOST_WIDE_INT factor = value.coeffs[0];
2887 if (factor == 0 || value.coeffs[1] != factor)
2888 return false;
2889 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2890 and a value of 16 is one vector width. */
2891 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2892 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2893 }
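
/* Illustrative sketch (not part of this file) of the same range check on a
   plain (COEFF0, COEFF1) pair.  For example, (32, 32) is two vector widths
   and can be added with ADDVL #2, while (6, 6) is three predicate widths
   and can be added with ADDPL #3; (17, 17) is neither a multiple of 16 nor
   even, so it needs a CNT-based sequence instead.  Guarded out of the
   build.  */
#if 0
#include <stdbool.h>

static bool
sve_addvl_addpl_immediate_example (long long coeff0, long long coeff1)
{
  long long factor = coeff0;
  if (factor == 0 || coeff1 != factor)
    return false;
  /* ADDVL: whole vectors, immediate in [-32, 31].  */
  if ((factor & 15) == 0 && factor >= -32 * 16 && factor <= 31 * 16)
    return true;
  /* ADDPL: whole predicates, immediate in [-32, 31].  */
  return (factor & 1) == 0 && factor >= -32 * 2 && factor <= 31 * 2;
}
#endif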
2894
2895 /* Likewise for rtx X. */
2896
2897 bool
2898 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2899 {
2900 poly_int64 value;
2901 return (poly_int_rtx_p (x, &value)
2902 && aarch64_sve_addvl_addpl_immediate_p (value));
2903 }
2904
2905 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2906 and storing the result in operand 0. */
2907
2908 char *
2909 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2910 {
2911 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2912 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2913 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2914
2915 /* Use INC or DEC if possible. */
2916 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2917 {
2918 if (aarch64_sve_cnt_immediate_p (offset_value))
2919 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2920 offset_value.coeffs[1], 0);
2921 if (aarch64_sve_cnt_immediate_p (-offset_value))
2922 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2923 -offset_value.coeffs[1], 0);
2924 }
2925
2926 int factor = offset_value.coeffs[1];
2927 if ((factor & 15) == 0)
2928 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2929 else
2930 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2931 return buffer;
2932 }
2933
2934 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2935 instruction. If it is, store the number of elements in each vector
2936 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2937 factor in *FACTOR_OUT (if nonnull). */
2938
2939 bool
2940 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2941 unsigned int *nelts_per_vq_out)
2942 {
2943 rtx elt;
2944 poly_int64 value;
2945
2946 if (!const_vec_duplicate_p (x, &elt)
2947 || !poly_int_rtx_p (elt, &value))
2948 return false;
2949
2950 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2951 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2952 /* There's no vector INCB. */
2953 return false;
2954
2955 HOST_WIDE_INT factor = value.coeffs[0];
2956 if (value.coeffs[1] != factor)
2957 return false;
2958
2959 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2960 if ((factor % nelts_per_vq) != 0
2961 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2962 return false;
2963
2964 if (factor_out)
2965 *factor_out = factor;
2966 if (nelts_per_vq_out)
2967 *nelts_per_vq_out = nelts_per_vq;
2968 return true;
2969 }
2970
2971 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2972 instruction. */
2973
2974 bool
2975 aarch64_sve_inc_dec_immediate_p (rtx x)
2976 {
2977 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2978 }
2979
2980 /* Return the asm template for an SVE vector INC or DEC instruction.
2981 OPERANDS gives the operands before the vector count and X is the
2982 value of the vector count operand itself. */
2983
2984 char *
2985 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2986 {
2987 int factor;
2988 unsigned int nelts_per_vq;
2989 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2990 gcc_unreachable ();
2991 if (factor < 0)
2992 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2993 nelts_per_vq);
2994 else
2995 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2996 nelts_per_vq);
2997 }
2998
2999 static int
3000 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3001 scalar_int_mode mode)
3002 {
3003 int i;
3004 unsigned HOST_WIDE_INT val, val2, mask;
3005 int one_match, zero_match;
3006 int num_insns;
3007
3008 val = INTVAL (imm);
3009
3010 if (aarch64_move_imm (val, mode))
3011 {
3012 if (generate)
3013 emit_insn (gen_rtx_SET (dest, imm));
3014 return 1;
3015 }
3016
3017 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3018 (with XXXX non-zero). In that case check to see if the move can be done in
3019 a smaller mode. */
3020 val2 = val & 0xffffffff;
3021 if (mode == DImode
3022 && aarch64_move_imm (val2, SImode)
3023 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3024 {
3025 if (generate)
3026 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3027
3028 /* Check if we have to emit a second instruction by checking to see
3029 if any of the upper 32 bits of the original DI mode value is set. */
3030 if (val == val2)
3031 return 1;
3032
3033 i = (val >> 48) ? 48 : 32;
3034
3035 if (generate)
3036 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3037 GEN_INT ((val >> i) & 0xffff)));
3038
3039 return 2;
3040 }
3041
3042 if ((val >> 32) == 0 || mode == SImode)
3043 {
3044 if (generate)
3045 {
3046 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3047 if (mode == SImode)
3048 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3049 GEN_INT ((val >> 16) & 0xffff)));
3050 else
3051 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3052 GEN_INT ((val >> 16) & 0xffff)));
3053 }
3054 return 2;
3055 }
3056
3057 /* Remaining cases are all for DImode. */
3058
3059 mask = 0xffff;
3060 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3061 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3062 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3063 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3064
3065 if (zero_match != 2 && one_match != 2)
3066 {
3067 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3068 For a 64-bit bitmask try whether changing 16 bits to all ones or
3069 zeroes creates a valid bitmask. To check any repeated bitmask,
3070 try using 16 bits from the other 32-bit half of val. */
3071
3072 for (i = 0; i < 64; i += 16, mask <<= 16)
3073 {
3074 val2 = val & ~mask;
3075 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3076 break;
3077 val2 = val | mask;
3078 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3079 break;
3080 val2 = val2 & ~mask;
3081 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3082 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3083 break;
3084 }
3085 if (i != 64)
3086 {
3087 if (generate)
3088 {
3089 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3090 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3091 GEN_INT ((val >> i) & 0xffff)));
3092 }
3093 return 2;
3094 }
3095 }
3096
3097 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3098 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3099 otherwise skip zero bits. */
3100
3101 num_insns = 1;
3102 mask = 0xffff;
3103 val2 = one_match > zero_match ? ~val : val;
3104 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3105
3106 if (generate)
3107 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3108 ? (val | ~(mask << i))
3109 : (val & (mask << i)))));
3110 for (i += 16; i < 64; i += 16)
3111 {
3112 if ((val2 & (mask << i)) == 0)
3113 continue;
3114 if (generate)
3115 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3116 GEN_INT ((val >> i) & 0xffff)));
3117 num_insns ++;
3118 }
3119
3120 return num_insns;
3121 }
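
/* Illustrative sketch (not part of this file) of the generic MOVZ/MOVN +
   MOVK fallback at the end of the function above: pick all-zeros or
   all-ones as the "background", let the initial MOV provide the background
   plus one 16-bit chunk, and emit one MOVK for every other chunk that
   differs from the background.  The sketch ignores the bitmask-immediate
   and 32-bit shortcuts handled earlier, so it can over-count for values
   those shortcuts catch.  Guarded out of the build.  */
#if 0
#include <stdint.h>
#include <stdio.h>

static int
count_mov_movk_insns (uint64_t val)
{
  int zero_match = 0, one_match = 0;
  for (int i = 0; i < 64; i += 16)
    {
      uint64_t chunk = (val >> i) & 0xffff;
      zero_match += (chunk == 0);
      one_match += (chunk == 0xffff);
    }
  uint64_t background = one_match > zero_match ? 0xffff : 0;

  int insns = 1;                      /* The initial MOVZ or MOVN.  */
  int covered_by_mov = 0;
  for (int i = 0; i < 64; i += 16)
    if (((val >> i) & 0xffff) != background)
      {
        if (!covered_by_mov)
          covered_by_mov = 1;         /* First chunk rides on the MOV.  */
        else
          insns++;                    /* Everything else needs a MOVK.  */
      }
  return insns;
}

int
main (void)
{
  /* 0x0000123400005678: MOV #0x5678, then MOVK #0x1234, LSL #32 -> 2.  */
  printf ("%d\n", count_mov_movk_insns (0x0000123400005678ull));
  return 0;
}
#endif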
3122
3123 /* Return whether imm is a 128-bit immediate which is simple enough to
3124 expand inline. */
3125 bool
3126 aarch64_mov128_immediate (rtx imm)
3127 {
3128 if (GET_CODE (imm) == CONST_INT)
3129 return true;
3130
3131 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3132
3133 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3134 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3135
3136 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3137 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3138 }
3139
3140
3141 /* Return the number of temporary registers that aarch64_add_offset_1
3142 would need to add OFFSET to a register. */
3143
3144 static unsigned int
3145 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3146 {
3147 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3148 }
3149
3150 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3151 a non-polynomial OFFSET. MODE is the mode of the addition.
3152 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3153 be set and CFA adjustments added to the generated instructions.
3154
3155 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3156 temporary if register allocation is already complete. This temporary
3157 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3158 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3159 the immediate again.
3160
3161 Since this function may be used to adjust the stack pointer, we must
3162 ensure that it cannot cause transient stack deallocation (for example
3163 by first incrementing SP and then decrementing when adjusting by a
3164 large immediate). */
3165
3166 static void
3167 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3168 rtx src, HOST_WIDE_INT offset, rtx temp1,
3169 bool frame_related_p, bool emit_move_imm)
3170 {
3171 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3172 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3173
3174 HOST_WIDE_INT moffset = abs_hwi (offset);
3175 rtx_insn *insn;
3176
3177 if (!moffset)
3178 {
3179 if (!rtx_equal_p (dest, src))
3180 {
3181 insn = emit_insn (gen_rtx_SET (dest, src));
3182 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3183 }
3184 return;
3185 }
3186
3187 /* Single instruction adjustment. */
3188 if (aarch64_uimm12_shift (moffset))
3189 {
3190 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3191 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3192 return;
3193 }
3194
3195 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3196 and either:
3197
3198 a) the offset cannot be loaded by a 16-bit move or
3199 b) there is no spare register into which we can move it. */
3200 if (moffset < 0x1000000
3201 && ((!temp1 && !can_create_pseudo_p ())
3202 || !aarch64_move_imm (moffset, mode)))
3203 {
3204 HOST_WIDE_INT low_off = moffset & 0xfff;
3205
3206 low_off = offset < 0 ? -low_off : low_off;
3207 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3208 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3209 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3210 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3211 return;
3212 }
3213
3214 /* Emit a move immediate if required and an addition/subtraction. */
3215 if (emit_move_imm)
3216 {
3217 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3218 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3219 }
3220 insn = emit_insn (offset < 0
3221 ? gen_sub3_insn (dest, src, temp1)
3222 : gen_add3_insn (dest, src, temp1));
3223 if (frame_related_p)
3224 {
3225 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3226 rtx adj = plus_constant (mode, src, offset);
3227 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3228 }
3229 }
3230
3231 /* Return the number of temporary registers that aarch64_add_offset
3232 would need to move OFFSET into a register or add OFFSET to a register;
3233 ADD_P is true if we want the latter rather than the former. */
3234
3235 static unsigned int
3236 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3237 {
3238 /* This follows the same structure as aarch64_add_offset. */
3239 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3240 return 0;
3241
3242 unsigned int count = 0;
3243 HOST_WIDE_INT factor = offset.coeffs[1];
3244 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3245 poly_int64 poly_offset (factor, factor);
3246 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3247 /* Need one register for the ADDVL/ADDPL result. */
3248 count += 1;
3249 else if (factor != 0)
3250 {
3251 factor = abs (factor);
3252 if (factor > 16 * (factor & -factor))
3253 /* Need one register for the CNT result and one for the multiplication
3254 factor. If necessary, the second temporary can be reused for the
3255 constant part of the offset. */
3256 return 2;
3257 /* Need one register for the CNT result (which might then
3258 be shifted). */
3259 count += 1;
3260 }
3261 return count + aarch64_add_offset_1_temporaries (constant);
3262 }
3263
3264 /* If X can be represented as a poly_int64, return the number
3265 of temporaries that are required to add it to a register.
3266 Return -1 otherwise. */
3267
3268 int
3269 aarch64_add_offset_temporaries (rtx x)
3270 {
3271 poly_int64 offset;
3272 if (!poly_int_rtx_p (x, &offset))
3273 return -1;
3274 return aarch64_offset_temporaries (true, offset);
3275 }
3276
3277 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3278 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3279 be set and CFA adjustments added to the generated instructions.
3280
3281 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3282 temporary if register allocation is already complete. This temporary
3283 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3284 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3285 false to avoid emitting the immediate again.
3286
3287 TEMP2, if nonnull, is a second temporary register that doesn't
3288 overlap either DEST or SRC.
3289
3290 Since this function may be used to adjust the stack pointer, we must
3291 ensure that it cannot cause transient stack deallocation (for example
3292 by first incrementing SP and then decrementing when adjusting by a
3293 large immediate). */
3294
3295 static void
3296 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3297 poly_int64 offset, rtx temp1, rtx temp2,
3298 bool frame_related_p, bool emit_move_imm = true)
3299 {
3300 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3301 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3302 gcc_assert (temp1 == NULL_RTX
3303 || !frame_related_p
3304 || !reg_overlap_mentioned_p (temp1, dest));
3305 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3306
3307 /* Try using ADDVL or ADDPL to add the whole value. */
3308 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3309 {
3310 rtx offset_rtx = gen_int_mode (offset, mode);
3311 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3312 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3313 return;
3314 }
3315
3316 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3317 SVE vector register, over and above the minimum size of 128 bits.
3318 This is equivalent to half the value returned by CNTD with a
3319 vector shape of ALL. */
3320 HOST_WIDE_INT factor = offset.coeffs[1];
3321 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3322
3323 /* Try using ADDVL or ADDPL to add the VG-based part. */
3324 poly_int64 poly_offset (factor, factor);
3325 if (src != const0_rtx
3326 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3327 {
3328 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3329 if (frame_related_p)
3330 {
3331 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3332 RTX_FRAME_RELATED_P (insn) = true;
3333 src = dest;
3334 }
3335 else
3336 {
3337 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3338 src = aarch64_force_temporary (mode, temp1, addr);
3339 temp1 = temp2;
3340 temp2 = NULL_RTX;
3341 }
3342 }
3343 /* Otherwise use a CNT-based sequence. */
3344 else if (factor != 0)
3345 {
3346 /* Use a subtraction if we have a negative factor. */
3347 rtx_code code = PLUS;
3348 if (factor < 0)
3349 {
3350 factor = -factor;
3351 code = MINUS;
3352 }
3353
3354 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3355 into the multiplication. */
3356 rtx val;
3357 int shift = 0;
3358 if (factor & 1)
3359 /* Use a right shift by 1. */
3360 shift = -1;
3361 else
3362 factor /= 2;
3363 HOST_WIDE_INT low_bit = factor & -factor;
3364 if (factor <= 16 * low_bit)
3365 {
3366 if (factor > 16 * 8)
3367 {
3368 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3369 the value with the minimum multiplier and shift it into
3370 position. */
3371 int extra_shift = exact_log2 (low_bit);
3372 shift += extra_shift;
3373 factor >>= extra_shift;
3374 }
3375 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3376 }
3377 else
3378 {
3379 /* Use CNTD, then multiply it by FACTOR. */
3380 val = gen_int_mode (poly_int64 (2, 2), mode);
3381 val = aarch64_force_temporary (mode, temp1, val);
3382
3383 /* Go back to using a negative multiplication factor if we have
3384 no register from which to subtract. */
3385 if (code == MINUS && src == const0_rtx)
3386 {
3387 factor = -factor;
3388 code = PLUS;
3389 }
3390 rtx coeff1 = gen_int_mode (factor, mode);
3391 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3392 val = gen_rtx_MULT (mode, val, coeff1);
3393 }
3394
3395 if (shift > 0)
3396 {
3397 /* Multiply by 1 << SHIFT. */
3398 val = aarch64_force_temporary (mode, temp1, val);
3399 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3400 }
3401 else if (shift == -1)
3402 {
3403 /* Divide by 2. */
3404 val = aarch64_force_temporary (mode, temp1, val);
3405 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3406 }
3407
3408 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3409 if (src != const0_rtx)
3410 {
3411 val = aarch64_force_temporary (mode, temp1, val);
3412 val = gen_rtx_fmt_ee (code, mode, src, val);
3413 }
3414 else if (code == MINUS)
3415 {
3416 val = aarch64_force_temporary (mode, temp1, val);
3417 val = gen_rtx_NEG (mode, val);
3418 }
3419
3420 if (constant == 0 || frame_related_p)
3421 {
3422 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3423 if (frame_related_p)
3424 {
3425 RTX_FRAME_RELATED_P (insn) = true;
3426 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3427 gen_rtx_SET (dest, plus_constant (Pmode, src,
3428 poly_offset)));
3429 }
3430 src = dest;
3431 if (constant == 0)
3432 return;
3433 }
3434 else
3435 {
3436 src = aarch64_force_temporary (mode, temp1, val);
3437 temp1 = temp2;
3438 temp2 = NULL_RTX;
3439 }
3440
3441 emit_move_imm = true;
3442 }
3443
3444 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3445 frame_related_p, emit_move_imm);
3446 }
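
/* Worked example for the decomposition above (illustrative only, not part
   of this file): an offset of 2 * VL + 12 bytes, where VL is the SVE vector
   length in bytes, is the poly_int64 (44, 32).  FACTOR = coeffs[1] = 32 and
   CONSTANT = 44 - 32 = 12, so the VL-dependent part can be added with
   ADDVL xD, xS, #2 and the remainder with ADD xD, xD, #12.  A plain-integer
   sketch of that split, assuming the factor is a whole number of vectors in
   ADDVL range (guarded out of the build):  */
#if 0
static void
split_sve_offset_example (long long coeff0, long long coeff1,
                          long long *addvl, long long *constant)
{
  long long factor = coeff1;      /* Bytes added per extra 128-bit block.  */
  *addvl = factor / 16;           /* ADDVL counts whole vector lengths.  */
  *constant = coeff0 - factor;    /* Left for a normal ADD/SUB.  */
}
#endif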
3447
3448 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3449 than a poly_int64. */
3450
3451 void
3452 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3453 rtx offset_rtx, rtx temp1, rtx temp2)
3454 {
3455 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3456 temp1, temp2, false);
3457 }
3458
3459 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3460 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3461 if TEMP1 already contains abs (DELTA). */
3462
3463 static inline void
3464 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3465 {
3466 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3467 temp1, temp2, true, emit_move_imm);
3468 }
3469
3470 /* Subtract DELTA from the stack pointer, marking the instructions
3471 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3472 if nonnull. */
3473
3474 static inline void
3475 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3476 bool emit_move_imm = true)
3477 {
3478 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3479 temp1, temp2, frame_related_p, emit_move_imm);
3480 }
3481
3482 /* Set DEST to (vec_series BASE STEP). */
3483
3484 static void
3485 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3486 {
3487 machine_mode mode = GET_MODE (dest);
3488 scalar_mode inner = GET_MODE_INNER (mode);
3489
3490 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3491 if (!aarch64_sve_index_immediate_p (base))
3492 base = force_reg (inner, base);
3493 if (!aarch64_sve_index_immediate_p (step))
3494 step = force_reg (inner, step);
3495
3496 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3497 }
3498
3499 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3500 register of mode MODE. Use TARGET for the result if it's nonnull
3501 and convenient.
3502
3503 The two vector modes must have the same element mode. The behavior
3504 is to duplicate architectural lane N of SRC into architectural lanes
3505 N + I * STEP of the result. On big-endian targets, architectural
3506 lane 0 of an Advanced SIMD vector is the last element of the vector
3507 in memory layout, so for big-endian targets this operation has the
3508 effect of reversing SRC before duplicating it. Callers need to
3509 account for this. */
3510
3511 rtx
3512 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3513 {
3514 machine_mode src_mode = GET_MODE (src);
3515 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3516 insn_code icode = (BYTES_BIG_ENDIAN
3517 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3518 : code_for_aarch64_vec_duplicate_vq_le (mode));
3519
3520 unsigned int i = 0;
3521 expand_operand ops[3];
3522 create_output_operand (&ops[i++], target, mode);
3523 create_output_operand (&ops[i++], src, src_mode);
3524 if (BYTES_BIG_ENDIAN)
3525 {
3526 /* Create a PARALLEL describing the reversal of SRC. */
3527 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3528 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3529 nelts_per_vq - 1, -1);
3530 create_fixed_operand (&ops[i++], sel);
3531 }
3532 expand_insn (icode, i, ops);
3533 return ops[0].value;
3534 }
3535
3536 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3537 the memory image into DEST. Return true on success. */
3538
3539 static bool
3540 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3541 {
3542 src = force_const_mem (GET_MODE (src), src);
3543 if (!src)
3544 return false;
3545
3546 /* Make sure that the address is legitimate. */
3547 if (!aarch64_sve_ld1rq_operand_p (src))
3548 {
3549 rtx addr = force_reg (Pmode, XEXP (src, 0));
3550 src = replace_equiv_address (src, addr);
3551 }
3552
3553 machine_mode mode = GET_MODE (dest);
3554 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3555 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3556 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3557 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3558 return true;
3559 }
3560
3561 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3562 SVE data mode and isn't a legitimate constant. Use TARGET for the
3563 result if convenient.
3564
3565 The returned register can have whatever mode seems most natural
3566 given the contents of SRC. */
3567
3568 static rtx
3569 aarch64_expand_sve_const_vector (rtx target, rtx src)
3570 {
3571 machine_mode mode = GET_MODE (src);
3572 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3573 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3574 scalar_mode elt_mode = GET_MODE_INNER (mode);
3575 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3576 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3577
3578 if (nelts_per_pattern == 1 && encoded_bits == 128)
3579 {
3580 /* The constant is a duplicated quadword but can't be narrowed
3581 beyond a quadword. Get the memory image of the first quadword
3582 as a 128-bit vector and try using LD1RQ to load it from memory.
3583
3584 The effect for both endiannesses is to load memory lane N into
3585 architectural lanes N + I * STEP of the result. On big-endian
3586 targets, the layout of the 128-bit vector in an Advanced SIMD
3587 register would be different from its layout in an SVE register,
3588 but this 128-bit vector is a memory value only. */
3589 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3590 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3591 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3592 return target;
3593 }
3594
3595 if (nelts_per_pattern == 1 && encoded_bits < 128)
3596 {
3597 /* The vector is a repeating sequence of 64 bits or fewer.
3598 See if we can load the sequence using an Advanced SIMD move and
3599 then duplicate it to fill a vector. This is better than using a GPR
3600 move because it keeps everything in the same register file. */
3601 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3602 rtx_vector_builder builder (vq_mode, npatterns, 1);
3603 for (unsigned int i = 0; i < npatterns; ++i)
3604 {
3605 /* We want memory lane N to go into architectural lane N,
3606 so reverse for big-endian targets. The DUP .Q pattern
3607 has a compensating reverse built-in. */
3608 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3609 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3610 }
3611 rtx vq_src = builder.build ();
3612 if (aarch64_simd_valid_immediate (vq_src, NULL))
3613 {
3614 vq_src = force_reg (vq_mode, vq_src);
3615 return aarch64_expand_sve_dupq (target, mode, vq_src);
3616 }
3617
3618 /* Get an integer representation of the repeating part of Advanced
3619 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3620 which for big-endian targets is lane-swapped wrt a normal
3621 Advanced SIMD vector. This means that for both endiannesses,
3622 memory lane N of SVE vector SRC corresponds to architectural
3623 lane N of a register holding VQ_SRC. This in turn means that
3624 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3625 as a single 128-bit value) and thus that memory lane 0 of SRC is
3626 in the lsb of the integer. Duplicating the integer therefore
3627 ensures that memory lane N of SRC goes into architectural lane
3628 N + I * STEP of the SVE register. */
3629 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3630 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3631 if (elt_value)
3632 {
3633 /* Pretend that we had a vector of INT_MODE to start with. */
3634 elt_mode = int_mode;
3635 mode = aarch64_full_sve_mode (int_mode).require ();
3636
3637 /* If the integer can be moved into a general register by a
3638 single instruction, do that and duplicate the result. */
3639 if (CONST_INT_P (elt_value)
3640 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3641 {
3642 elt_value = force_reg (elt_mode, elt_value);
3643 return expand_vector_broadcast (mode, elt_value);
3644 }
3645 }
3646 else if (npatterns == 1)
3647 /* We're duplicating a single value, but can't do better than
3648 force it to memory and load from there. This handles things
3649 like symbolic constants. */
3650 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3651
3652 if (elt_value)
3653 {
3654 /* Load the element from memory if we can, otherwise move it into
3655 a register and use a DUP. */
3656 rtx op = force_const_mem (elt_mode, elt_value);
3657 if (!op)
3658 op = force_reg (elt_mode, elt_value);
3659 return expand_vector_broadcast (mode, op);
3660 }
3661 }
3662
3663 /* Try using INDEX. */
3664 rtx base, step;
3665 if (const_vec_series_p (src, &base, &step))
3666 {
3667 aarch64_expand_vec_series (target, base, step);
3668 return target;
3669 }
3670
3671 /* From here on, it's better to force the whole constant to memory
3672 if we can. */
3673 if (GET_MODE_NUNITS (mode).is_constant ())
3674 return NULL_RTX;
3675
3676 /* Expand each pattern individually. */
3677 gcc_assert (npatterns > 1);
3678 rtx_vector_builder builder;
3679 auto_vec<rtx, 16> vectors (npatterns);
3680 for (unsigned int i = 0; i < npatterns; ++i)
3681 {
3682 builder.new_vector (mode, 1, nelts_per_pattern);
3683 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3684 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3685 vectors.quick_push (force_reg (mode, builder.build ()));
3686 }
3687
3688 /* Use permutes to interleave the separate vectors. */
3689 while (npatterns > 1)
3690 {
3691 npatterns /= 2;
3692 for (unsigned int i = 0; i < npatterns; ++i)
3693 {
3694 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3695 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3696 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3697 vectors[i] = tmp;
3698 }
3699 }
3700 gcc_assert (vectors[0] == target);
3701 return target;
3702 }
3703
3704 /* Use WHILE to set a predicate register of mode MODE in which the first
3705 VL bits are set and the rest are clear. Use TARGET for the register
3706 if it's nonnull and convenient. */
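/* A sketch of the intended output for MODE == VNx16BImode and VL == 3,
   assuming the limit ends up in x0 (register numbers illustrative):

	mov	x0, #3
	whilelo	p0.b, xzr, x0

   which sets the first three .B lanes of p0 and clears the rest.  */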
3707
3708 static rtx
3709 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3710 unsigned int vl)
3711 {
3712 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3713 target = aarch64_target_reg (target, mode);
3714 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3715 return target;
3716 }
3717
3718 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3719 constant in BUILDER into an SVE predicate register. Return the register
3720 on success, otherwise return null. Use TARGET for the register if
3721 nonnull and convenient. */
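/* For example (illustrative only): a constant in which the first four .S
   elements are true and the rest false can be loaded with
   "ptrue p0.s, vl4", while a leading-true length that has no named PTRUE
   pattern falls back to the WHILE sequence above.  */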
3722
3723 static rtx
3724 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder)
3725 {
3726 if (builder.encoded_nelts () == 1)
3727 /* A PFALSE or a PTRUE .B ALL. */
3728 return aarch64_emit_set_immediate (target, builder);
3729
3730 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3731 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3732 {
3733 /* If we can load the constant using PTRUE, use it as-is. */
3734 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3735 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3736 return aarch64_emit_set_immediate (target, builder);
3737
3738 /* Otherwise use WHILE to set the first VL bits. */
3739 return aarch64_sve_move_pred_via_while (target, mode, vl);
3740 }
3741
3742 return NULL_RTX;
3743 }
3744
3745 /* Return an SVE predicate register that contains the VNx16BImode
3746 constant in BUILDER, without going through the move expanders.
3747
3748 The returned register can have whatever mode seems most natural
3749 given the contents of BUILDER. Use TARGET for the result if
3750 convenient. */
3751
3752 static rtx
3753 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3754 {
3755 /* Try loading the constant using pure predicate operations. */
3756 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder))
3757 return res;
3758
3759 /* Try forcing the constant to memory. */
3760 if (builder.full_nelts ().is_constant ())
3761 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
3762 {
3763 target = aarch64_target_reg (target, VNx16BImode);
3764 emit_move_insn (target, mem);
3765 return target;
3766 }
3767
3768 /* The last resort is to load the constant as an integer and then
3769 compare it against zero. Use -1 for set bits in order to increase
3770 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
3771 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
3772 builder.nelts_per_pattern ());
3773 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3774 int_builder.quick_push (INTVAL (builder.elt (i))
3775 ? constm1_rtx : const0_rtx);
3776 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
3777 int_builder.build ());
3778 }
3779
3780 /* Set DEST to immediate IMM. */
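/* In outline (a summary of the dispatch below): symbolic and label
   immediates in scalar integer modes are classified and loaded via the
   appropriate relocation sequence or the literal pool; SVE predicate
   constants go through aarch64_expand_sve_const_pred; immediates that a
   single instruction can handle are emitted directly; remaining SVE data
   constants go through aarch64_expand_sve_const_vector; other non-integer
   immediates are forced to memory; plain CONST_INTs are handled by
   aarch64_internal_mov_immediate.  */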
3781
3782 void
3783 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3784 {
3785 machine_mode mode = GET_MODE (dest);
3786
3787 /* Check on what type of symbol it is. */
3788 scalar_int_mode int_mode;
3789 if ((GET_CODE (imm) == SYMBOL_REF
3790 || GET_CODE (imm) == LABEL_REF
3791 || GET_CODE (imm) == CONST
3792 || GET_CODE (imm) == CONST_POLY_INT)
3793 && is_a <scalar_int_mode> (mode, &int_mode))
3794 {
3795 rtx mem;
3796 poly_int64 offset;
3797 HOST_WIDE_INT const_offset;
3798 enum aarch64_symbol_type sty;
3799
3800 /* If we have (const (plus symbol offset)), separate out the offset
3801 before we start classifying the symbol. */
3802 rtx base = strip_offset (imm, &offset);
3803
3804 /* We must always add an offset involving VL separately, rather than
3805 folding it into the relocation. */
3806 if (!offset.is_constant (&const_offset))
3807 {
3808 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3809 emit_insn (gen_rtx_SET (dest, imm));
3810 else
3811 {
3812 /* Do arithmetic on 32-bit values if the result is smaller
3813 than that. */
3814 if (partial_subreg_p (int_mode, SImode))
3815 {
3816 /* It is invalid to do symbol calculations in modes
3817 narrower than SImode. */
3818 gcc_assert (base == const0_rtx);
3819 dest = gen_lowpart (SImode, dest);
3820 int_mode = SImode;
3821 }
3822 if (base != const0_rtx)
3823 {
3824 base = aarch64_force_temporary (int_mode, dest, base);
3825 aarch64_add_offset (int_mode, dest, base, offset,
3826 NULL_RTX, NULL_RTX, false);
3827 }
3828 else
3829 aarch64_add_offset (int_mode, dest, base, offset,
3830 dest, NULL_RTX, false);
3831 }
3832 return;
3833 }
3834
3835 sty = aarch64_classify_symbol (base, const_offset);
3836 switch (sty)
3837 {
3838 case SYMBOL_FORCE_TO_MEM:
3839 if (const_offset != 0
3840 && targetm.cannot_force_const_mem (int_mode, imm))
3841 {
3842 gcc_assert (can_create_pseudo_p ());
3843 base = aarch64_force_temporary (int_mode, dest, base);
3844 aarch64_add_offset (int_mode, dest, base, const_offset,
3845 NULL_RTX, NULL_RTX, false);
3846 return;
3847 }
3848
3849 mem = force_const_mem (ptr_mode, imm);
3850 gcc_assert (mem);
3851
3852 /* If we aren't generating PC relative literals, then
3853 we need to expand the literal pool access carefully.
3854 This is something that needs to be done in a number
3855 of places, so could well live as a separate function. */
3856 if (!aarch64_pcrelative_literal_loads)
3857 {
3858 gcc_assert (can_create_pseudo_p ());
3859 base = gen_reg_rtx (ptr_mode);
3860 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3861 if (ptr_mode != Pmode)
3862 base = convert_memory_address (Pmode, base);
3863 mem = gen_rtx_MEM (ptr_mode, base);
3864 }
3865
3866 if (int_mode != ptr_mode)
3867 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3868
3869 emit_insn (gen_rtx_SET (dest, mem));
3870
3871 return;
3872
3873 case SYMBOL_SMALL_TLSGD:
3874 case SYMBOL_SMALL_TLSDESC:
3875 case SYMBOL_SMALL_TLSIE:
3876 case SYMBOL_SMALL_GOT_28K:
3877 case SYMBOL_SMALL_GOT_4G:
3878 case SYMBOL_TINY_GOT:
3879 case SYMBOL_TINY_TLSIE:
3880 if (const_offset != 0)
3881 {
3882 gcc_assert (can_create_pseudo_p ());
3883 base = aarch64_force_temporary (int_mode, dest, base);
3884 aarch64_add_offset (int_mode, dest, base, const_offset,
3885 NULL_RTX, NULL_RTX, false);
3886 return;
3887 }
3888 /* FALLTHRU */
3889
3890 case SYMBOL_SMALL_ABSOLUTE:
3891 case SYMBOL_TINY_ABSOLUTE:
3892 case SYMBOL_TLSLE12:
3893 case SYMBOL_TLSLE24:
3894 case SYMBOL_TLSLE32:
3895 case SYMBOL_TLSLE48:
3896 aarch64_load_symref_appropriately (dest, imm, sty);
3897 return;
3898
3899 default:
3900 gcc_unreachable ();
3901 }
3902 }
3903
3904 if (!CONST_INT_P (imm))
3905 {
3906 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
3907 {
3908 /* Only the low bit of each .H, .S and .D element is defined,
3909 so we can set the upper bits to whatever we like. If the
3910 predicate is all-true in MODE, prefer to set all the undefined
3911 bits as well, so that we can share a single .B predicate for
3912 all modes. */
3913 if (imm == CONSTM1_RTX (mode))
3914 imm = CONSTM1_RTX (VNx16BImode);
3915
3916 /* All methods for constructing predicate modes wider than VNx16BI
3917 will set the upper bits of each element to zero. Expose this
3918 by moving such constants as a VNx16BI, so that all bits are
3919 significant and so that constants for different modes can be
3920 shared. The wider constant will still be available as a
3921 REG_EQUAL note. */
3922 rtx_vector_builder builder;
3923 if (aarch64_get_sve_pred_bits (builder, imm))
3924 {
3925 rtx res = aarch64_expand_sve_const_pred (dest, builder);
3926 if (dest != res)
3927 emit_move_insn (dest, gen_lowpart (mode, res));
3928 return;
3929 }
3930 }
3931
3932 if (GET_CODE (imm) == HIGH
3933 || aarch64_simd_valid_immediate (imm, NULL))
3934 {
3935 emit_insn (gen_rtx_SET (dest, imm));
3936 return;
3937 }
3938
3939 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
3940 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
3941 {
3942 if (dest != res)
3943 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
3944 return;
3945 }
3946
3947 rtx mem = force_const_mem (mode, imm);
3948 gcc_assert (mem);
3949 emit_move_insn (dest, mem);
3950 return;
3951 }
3952
3953 aarch64_internal_mov_immediate (dest, imm, true,
3954 as_a <scalar_int_mode> (mode));
3955 }
3956
3957 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3958 that is known to contain PTRUE. */
3959
3960 void
3961 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3962 {
3963 expand_operand ops[3];
3964 machine_mode mode = GET_MODE (dest);
3965 create_output_operand (&ops[0], dest, mode);
3966 create_input_operand (&ops[1], pred, GET_MODE (pred));
3967 create_input_operand (&ops[2], src, mode);
3968 temporary_volatile_ok v (true);
3969 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3970 }
3971
3972 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3973 operand is in memory. In this case we need to use the predicated LD1
3974 and ST1 instead of LDR and STR, both for correctness on big-endian
3975 targets and because LD1 and ST1 support a wider range of addressing modes.
3976 PRED_MODE is the mode of the predicate.
3977
3978 See the comment at the head of aarch64-sve.md for details about the
3979 big-endian handling. */
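/* For example (illustrative only): a memory-to-memory SVE copy is expanded
   below as a predicated LD1 into a fresh register followed by a predicated
   ST1, both governed by a PTRUE in PRED_MODE.  */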
3980
3981 void
3982 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3983 {
3984 machine_mode mode = GET_MODE (dest);
3985 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3986 if (!register_operand (src, mode)
3987 && !register_operand (dest, mode))
3988 {
3989 rtx tmp = gen_reg_rtx (mode);
3990 if (MEM_P (src))
3991 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3992 else
3993 emit_move_insn (tmp, src);
3994 src = tmp;
3995 }
3996 aarch64_emit_sve_pred_move (dest, ptrue, src);
3997 }
3998
3999 /* Called only on big-endian targets. See whether an SVE vector move
4000 from SRC to DEST is effectively a REV[BHW] instruction, because at
4001 least one operand is a subreg of an SVE vector that has wider or
4002 narrower elements. Return true and emit the instruction if so.
4003
4004 For example:
4005
4006 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4007
4008 represents a VIEW_CONVERT between the following vectors, viewed
4009 in memory order:
4010
4011 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4012 R1: { [0], [1], [2], [3], ... }
4013
4014 The high part of lane X in R2 should therefore correspond to lane X*2
4015 of R1, but the register representations are:
4016
4017 msb lsb
4018 R2: ...... [1].high [1].low [0].high [0].low
4019 R1: ...... [3] [2] [1] [0]
4020
4021 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4022 We therefore need a reverse operation to swap the high and low values
4023 around.
4024
4025 This is purely an optimization. Without it we would spill the
4026 subreg operand to the stack in one mode and reload it in the
4027 other mode, which has the same effect as the REV. */
4028
4029 bool
4030 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4031 {
4032 gcc_assert (BYTES_BIG_ENDIAN);
4033 if (GET_CODE (dest) == SUBREG)
4034 dest = SUBREG_REG (dest);
4035 if (GET_CODE (src) == SUBREG)
4036 src = SUBREG_REG (src);
4037
4038 /* The optimization handles two single SVE REGs with different element
4039 sizes. */
4040 if (!REG_P (dest)
4041 || !REG_P (src)
4042 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4043 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4044 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4045 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4046 return false;
4047
4048 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4049 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4050 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4051 UNSPEC_REV_SUBREG);
4052 emit_insn (gen_rtx_SET (dest, unspec));
4053 return true;
4054 }
4055
4056 /* Return a copy of X with mode MODE, without changing its other
4057 attributes. Unlike gen_lowpart, this doesn't care whether the
4058 mode change is valid. */
4059
4060 static rtx
4061 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4062 {
4063 if (GET_MODE (x) == mode)
4064 return x;
4065
4066 x = shallow_copy_rtx (x);
4067 set_mode_and_regno (x, mode, REGNO (x));
4068 return x;
4069 }
4070
4071 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4072 operands. */
4073
4074 void
4075 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4076 {
4077 /* Decide which REV operation we need. The mode with narrower elements
4078 determines the mode of the operands and the mode with the wider
4079 elements determines the reverse width. */
4080 machine_mode mode_with_wider_elts = GET_MODE (dest);
4081 machine_mode mode_with_narrower_elts = GET_MODE (src);
4082 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4083 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4084 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4085
4086 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4087 unsigned int unspec;
4088 if (wider_bytes == 8)
4089 unspec = UNSPEC_REV64;
4090 else if (wider_bytes == 4)
4091 unspec = UNSPEC_REV32;
4092 else if (wider_bytes == 2)
4093 unspec = UNSPEC_REV16;
4094 else
4095 gcc_unreachable ();
4096 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4097
4098 /* Emit:
4099
4100 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
4101 UNSPEC_MERGE_PTRUE))
4102
4103 with the appropriate modes. */
4104 ptrue = gen_lowpart (pred_mode, ptrue);
4105 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
4106 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
4107 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
4108 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
4109 UNSPEC_MERGE_PTRUE);
4110 emit_insn (gen_rtx_SET (dest, src));
4111 }
4112
4113 static bool
4114 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4115 tree exp ATTRIBUTE_UNUSED)
4116 {
4117 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4118 return false;
4119
4120 return true;
4121 }
4122
4123 /* Implement TARGET_PASS_BY_REFERENCE. */
4124
4125 static bool
4126 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4127 machine_mode mode,
4128 const_tree type,
4129 bool named ATTRIBUTE_UNUSED)
4130 {
4131 HOST_WIDE_INT size;
4132 machine_mode dummymode;
4133 int nregs;
4134
4135 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4136 if (mode == BLKmode && type)
4137 size = int_size_in_bytes (type);
4138 else
4139 /* No frontends can create types with variable-sized modes, so we
4140 shouldn't be asked to pass or return them. */
4141 size = GET_MODE_SIZE (mode).to_constant ();
4142
4143 /* Aggregates are passed by reference based on their size. */
4144 if (type && AGGREGATE_TYPE_P (type))
4145 {
4146 size = int_size_in_bytes (type);
4147 }
4148
4149 /* Variable-sized arguments are always passed by reference. */
4150 if (size < 0)
4151 return true;
4152
4153 /* Can this be a candidate to be passed in fp/simd register(s)? */
4154 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4155 &dummymode, &nregs,
4156 NULL))
4157 return false;
4158
4159 /* Arguments which are variable sized or larger than 2 registers are
4160 passed by reference unless they are a homogeneous floating-point
4161 aggregate. */
4162 return size > 2 * UNITS_PER_WORD;
4163 }
4164
4165 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4166 static bool
4167 aarch64_return_in_msb (const_tree valtype)
4168 {
4169 machine_mode dummy_mode;
4170 int dummy_int;
4171
4172 /* Never happens in little-endian mode. */
4173 if (!BYTES_BIG_ENDIAN)
4174 return false;
4175
4176 /* Only composite types smaller than or equal to 16 bytes can
4177 be potentially returned in registers. */
4178 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4179 || int_size_in_bytes (valtype) <= 0
4180 || int_size_in_bytes (valtype) > 16)
4181 return false;
4182
4183 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4184 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4185 is always passed/returned in the least significant bits of fp/simd
4186 register(s). */
4187 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4188 &dummy_mode, &dummy_int, NULL))
4189 return false;
4190
4191 return true;
4192 }
4193
4194 /* Implement TARGET_FUNCTION_VALUE.
4195 Define how to find the value returned by a function. */
4196
4197 static rtx
4198 aarch64_function_value (const_tree type, const_tree func,
4199 bool outgoing ATTRIBUTE_UNUSED)
4200 {
4201 machine_mode mode;
4202 int unsignedp;
4203 int count;
4204 machine_mode ag_mode;
4205
4206 mode = TYPE_MODE (type);
4207 if (INTEGRAL_TYPE_P (type))
4208 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4209
4210 if (aarch64_return_in_msb (type))
4211 {
4212 HOST_WIDE_INT size = int_size_in_bytes (type);
4213
4214 if (size % UNITS_PER_WORD != 0)
4215 {
4216 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4217 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4218 }
4219 }
4220
4221 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4222 &ag_mode, &count, NULL))
4223 {
4224 if (!aarch64_composite_type_p (type, mode))
4225 {
4226 gcc_assert (count == 1 && mode == ag_mode);
4227 return gen_rtx_REG (mode, V0_REGNUM);
4228 }
4229 else
4230 {
4231 int i;
4232 rtx par;
4233
4234 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4235 for (i = 0; i < count; i++)
4236 {
4237 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4238 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4239 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4240 XVECEXP (par, 0, i) = tmp;
4241 }
4242 return par;
4243 }
4244 }
4245 else
4246 return gen_rtx_REG (mode, R0_REGNUM);
4247 }
4248
4249 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4250 Return true if REGNO is the number of a hard register in which the values
4251 of called function may come back. */
4252
4253 static bool
4254 aarch64_function_value_regno_p (const unsigned int regno)
4255 {
4256 /* At most 16 bytes can be returned in the general registers. Examples
4257 of 16-byte return values are 128-bit integers and 16-byte small
4258 structures (excluding homogeneous floating-point aggregates). */
4259 if (regno == R0_REGNUM || regno == R1_REGNUM)
4260 return true;
4261
4262 /* Up to four fp/simd registers can return a function value, e.g. a
4263 homogeneous floating-point aggregate having four members. */
4264 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4265 return TARGET_FLOAT;
4266
4267 return false;
4268 }
4269
4270 /* Implement TARGET_RETURN_IN_MEMORY.
4271
4272 If the type T of the result of a function is such that
4273 void func (T arg)
4274 would require that arg be passed as a value in a register (or set of
4275 registers) according to the parameter passing rules, then the result
4276 is returned in the same registers as would be used for such an
4277 argument. */
4278
4279 static bool
4280 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4281 {
4282 HOST_WIDE_INT size;
4283 machine_mode ag_mode;
4284 int count;
4285
4286 if (!AGGREGATE_TYPE_P (type)
4287 && TREE_CODE (type) != COMPLEX_TYPE
4288 && TREE_CODE (type) != VECTOR_TYPE)
4289 /* Simple scalar types are always returned in registers. */
4290 return false;
4291
4292 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4293 type,
4294 &ag_mode,
4295 &count,
4296 NULL))
4297 return false;
4298
4299 /* Types larger than 2 registers are returned in memory. */
4300 size = int_size_in_bytes (type);
4301 return (size < 0 || size > 2 * UNITS_PER_WORD);
4302 }
4303
4304 static bool
4305 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4306 const_tree type, int *nregs)
4307 {
4308 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4309 return aarch64_vfp_is_call_or_return_candidate (mode,
4310 type,
4311 &pcum->aapcs_vfp_rmode,
4312 nregs,
4313 NULL);
4314 }
4315
4316 /* Given MODE and TYPE of a function argument, return the alignment in
4317 bits. The idea is to suppress any stronger alignment requested by
4318 the user and opt for the natural alignment (specified in AAPCS64 \S
4319 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4320 calculated in versions of GCC prior to GCC-9. This is a helper
4321 function for local use only. */
4322
4323 static unsigned int
4324 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4325 bool *abi_break)
4326 {
4327 *abi_break = false;
4328 if (!type)
4329 return GET_MODE_ALIGNMENT (mode);
4330
4331 if (integer_zerop (TYPE_SIZE (type)))
4332 return 0;
4333
4334 gcc_assert (TYPE_MODE (type) == mode);
4335
4336 if (!AGGREGATE_TYPE_P (type))
4337 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4338
4339 if (TREE_CODE (type) == ARRAY_TYPE)
4340 return TYPE_ALIGN (TREE_TYPE (type));
4341
4342 unsigned int alignment = 0;
4343 unsigned int bitfield_alignment = 0;
4344 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4345 if (TREE_CODE (field) == FIELD_DECL)
4346 {
4347 alignment = std::max (alignment, DECL_ALIGN (field));
4348 if (DECL_BIT_FIELD_TYPE (field))
4349 bitfield_alignment
4350 = std::max (bitfield_alignment,
4351 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4352 }
4353
4354 if (bitfield_alignment > alignment)
4355 {
4356 *abi_break = true;
4357 return bitfield_alignment;
4358 }
4359
4360 return alignment;
4361 }
4362
4363 /* Layout a function argument according to the AAPCS64 rules. The rule
4364 numbers refer to the rule numbers in the AAPCS64. */
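/* For example (illustrative only): an HFA such as
   struct { float a, b, c, d; } occupies four consecutive SIMD/FP
   registers (rules C.1 - C.5); a 16-byte, 16-byte-aligned integer
   struct starts at an even-numbered general register (rule C.8);
   and an argument that no longer fits in registers is placed on the
   stack (rule C.11 onwards).  */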
4365
4366 static void
4367 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4368 const_tree type,
4369 bool named ATTRIBUTE_UNUSED)
4370 {
4371 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4372 int ncrn, nvrn, nregs;
4373 bool allocate_ncrn, allocate_nvrn;
4374 HOST_WIDE_INT size;
4375 bool abi_break;
4376
4377 /* We need to do this once per argument. */
4378 if (pcum->aapcs_arg_processed)
4379 return;
4380
4381 pcum->aapcs_arg_processed = true;
4382
4383 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4384 if (type)
4385 size = int_size_in_bytes (type);
4386 else
4387 /* No frontends can create types with variable-sized modes, so we
4388 shouldn't be asked to pass or return them. */
4389 size = GET_MODE_SIZE (mode).to_constant ();
4390 size = ROUND_UP (size, UNITS_PER_WORD);
4391
4392 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4393 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4394 mode,
4395 type,
4396 &nregs);
4397
4398 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4399 The following code thus handles passing by SIMD/FP registers first. */
4400
4401 nvrn = pcum->aapcs_nvrn;
4402
4403 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4404 and homogeneous short-vector aggregates (HVA). */
4405 if (allocate_nvrn)
4406 {
4407 if (!TARGET_FLOAT)
4408 aarch64_err_no_fpadvsimd (mode);
4409
4410 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4411 {
4412 pcum->aapcs_nextnvrn = nvrn + nregs;
4413 if (!aarch64_composite_type_p (type, mode))
4414 {
4415 gcc_assert (nregs == 1);
4416 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4417 }
4418 else
4419 {
4420 rtx par;
4421 int i;
4422 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4423 for (i = 0; i < nregs; i++)
4424 {
4425 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4426 V0_REGNUM + nvrn + i);
4427 rtx offset = gen_int_mode
4428 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4429 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4430 XVECEXP (par, 0, i) = tmp;
4431 }
4432 pcum->aapcs_reg = par;
4433 }
4434 return;
4435 }
4436 else
4437 {
4438 /* C.3 NSRN is set to 8. */
4439 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4440 goto on_stack;
4441 }
4442 }
4443
4444 ncrn = pcum->aapcs_ncrn;
4445 nregs = size / UNITS_PER_WORD;
4446
4447 /* C6 - C9, though the sign and zero extension semantics are
4448 handled elsewhere. This is the case where the argument fits
4449 entirely in general registers. */
4450 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4451 {
4452 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4453
4454 /* C.8 if the argument has an alignment of 16 then the NGRN is
4455 rounded up to the next even number. */
4456 if (nregs == 2
4457 && ncrn % 2
4458 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4459 comparison is there because for > 16 * BITS_PER_UNIT
4460 alignment nregs should be > 2 and therefore it should be
4461 passed by reference rather than value. */
4462 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4463 == 16 * BITS_PER_UNIT))
4464 {
4465 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4466 inform (input_location, "parameter passing for argument of type "
4467 "%qT changed in GCC 9.1", type);
4468 ++ncrn;
4469 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4470 }
4471
4472 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4473 A reg is still generated for it, but the caller should be smart
4474 enough not to use it. */
4475 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4476 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4477 else
4478 {
4479 rtx par;
4480 int i;
4481
4482 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4483 for (i = 0; i < nregs; i++)
4484 {
4485 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4486 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4487 GEN_INT (i * UNITS_PER_WORD));
4488 XVECEXP (par, 0, i) = tmp;
4489 }
4490 pcum->aapcs_reg = par;
4491 }
4492
4493 pcum->aapcs_nextncrn = ncrn + nregs;
4494 return;
4495 }
4496
4497 /* C.11 */
4498 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4499
4500 /* The argument is passed on the stack; record the needed number of words for
4501 this argument and align the total size if necessary. */
4502 on_stack:
4503 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4504
4505 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4506 == 16 * BITS_PER_UNIT)
4507 {
4508 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4509 if (pcum->aapcs_stack_size != new_size)
4510 {
4511 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4512 inform (input_location, "parameter passing for argument of type "
4513 "%qT changed in GCC 9.1", type);
4514 pcum->aapcs_stack_size = new_size;
4515 }
4516 }
4517 return;
4518 }
4519
4520 /* Implement TARGET_FUNCTION_ARG. */
4521
4522 static rtx
4523 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4524 const_tree type, bool named)
4525 {
4526 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4527 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4528
4529 if (mode == VOIDmode)
4530 return NULL_RTX;
4531
4532 aarch64_layout_arg (pcum_v, mode, type, named);
4533 return pcum->aapcs_reg;
4534 }
4535
4536 void
4537 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4538 const_tree fntype ATTRIBUTE_UNUSED,
4539 rtx libname ATTRIBUTE_UNUSED,
4540 const_tree fndecl ATTRIBUTE_UNUSED,
4541 unsigned n_named ATTRIBUTE_UNUSED)
4542 {
4543 pcum->aapcs_ncrn = 0;
4544 pcum->aapcs_nvrn = 0;
4545 pcum->aapcs_nextncrn = 0;
4546 pcum->aapcs_nextnvrn = 0;
4547 pcum->pcs_variant = ARM_PCS_AAPCS64;
4548 pcum->aapcs_reg = NULL_RTX;
4549 pcum->aapcs_arg_processed = false;
4550 pcum->aapcs_stack_words = 0;
4551 pcum->aapcs_stack_size = 0;
4552
4553 if (!TARGET_FLOAT
4554 && fndecl && TREE_PUBLIC (fndecl)
4555 && fntype && fntype != error_mark_node)
4556 {
4557 const_tree type = TREE_TYPE (fntype);
4558 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4559 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4560 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4561 &mode, &nregs, NULL))
4562 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4563 }
4564 return;
4565 }
4566
4567 static void
4568 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4569 machine_mode mode,
4570 const_tree type,
4571 bool named)
4572 {
4573 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4574 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4575 {
4576 aarch64_layout_arg (pcum_v, mode, type, named);
4577 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4578 != (pcum->aapcs_stack_words != 0));
4579 pcum->aapcs_arg_processed = false;
4580 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4581 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4582 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4583 pcum->aapcs_stack_words = 0;
4584 pcum->aapcs_reg = NULL_RTX;
4585 }
4586 }
4587
4588 bool
4589 aarch64_function_arg_regno_p (unsigned regno)
4590 {
4591 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4592 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4593 }
4594
4595 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4596 PARM_BOUNDARY bits of alignment, but will be given anything up
4597 to STACK_BOUNDARY bits if the type requires it. This makes sure
4598 that both before and after the layout of each argument, the Next
4599 Stacked Argument Address (NSAA) will have a minimum alignment of
4600 8 bytes. */
4601
4602 static unsigned int
4603 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4604 {
4605 bool abi_break;
4606 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4607 &abi_break);
4608 if (abi_break && warn_psabi)
4609 inform (input_location, "parameter passing for argument of type "
4610 "%qT changed in GCC 9.1", type);
4611
4612 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4613 }
4614
4615 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4616
4617 static fixed_size_mode
4618 aarch64_get_reg_raw_mode (int regno)
4619 {
4620 if (TARGET_SVE && FP_REGNUM_P (regno))
4621 /* Don't use the SVE part of the register for __builtin_apply and
4622 __builtin_return. The SVE registers aren't used by the normal PCS,
4623 so using them there would be a waste of time. The PCS extensions
4624 for SVE types are fundamentally incompatible with the
4625 __builtin_return/__builtin_apply interface. */
4626 return as_a <fixed_size_mode> (V16QImode);
4627 return default_get_reg_raw_mode (regno);
4628 }
4629
4630 /* Implement TARGET_FUNCTION_ARG_PADDING.
4631
4632 Small aggregate types are placed in the lowest memory address.
4633
4634 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4635
4636 static pad_direction
4637 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4638 {
4639 /* On little-endian targets, the least significant byte of every stack
4640 argument is passed at the lowest byte address of the stack slot. */
4641 if (!BYTES_BIG_ENDIAN)
4642 return PAD_UPWARD;
4643
4644 /* Otherwise, integral, floating-point and pointer types are padded downward:
4645 the least significant byte of a stack argument is passed at the highest
4646 byte address of the stack slot. */
4647 if (type
4648 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4649 || POINTER_TYPE_P (type))
4650 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4651 return PAD_DOWNWARD;
4652
4653 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4654 return PAD_UPWARD;
4655 }
4656
4657 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4658
4659 It specifies padding for the last (possibly the only)
4660 element of a block move between registers and memory.
4661 Assuming the block is in memory, padding upward means that
4662 the last element is padded after its most significant byte,
4663 while with downward padding the last element is padded at
4664 its least significant byte side.
4665
4666 Small aggregates and small complex types are always padded
4667 upwards.
4668
4669 We don't need to worry about homogeneous floating-point or
4670 short-vector aggregates; their move is not affected by the
4671 padding direction determined here. Regardless of endianness,
4672 each element of such an aggregate is put in the least
4673 significant bits of a fp/simd register.
4674
4675 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4676 register has useful data, and return the opposite if the most
4677 significant byte does. */
4678
4679 bool
4680 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4681 bool first ATTRIBUTE_UNUSED)
4682 {
4683
4684 /* Small composite types are always padded upward. */
4685 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4686 {
4687 HOST_WIDE_INT size;
4688 if (type)
4689 size = int_size_in_bytes (type);
4690 else
4691 /* No frontends can create types with variable-sized modes, so we
4692 shouldn't be asked to pass or return them. */
4693 size = GET_MODE_SIZE (mode).to_constant ();
4694 if (size < 2 * UNITS_PER_WORD)
4695 return true;
4696 }
4697
4698 /* Otherwise, use the default padding. */
4699 return !BYTES_BIG_ENDIAN;
4700 }
4701
4702 static scalar_int_mode
4703 aarch64_libgcc_cmp_return_mode (void)
4704 {
4705 return SImode;
4706 }
4707
4708 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4709
4710 /* We use the 12-bit shifted immediate arithmetic instructions so values
4711 must be multiple of (1 << 12), i.e. 4096. */
4712 #define ARITH_FACTOR 4096
4713
4714 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4715 #error Cannot use simple address calculation for stack probing
4716 #endif
4717
4718 /* The pair of scratch registers used for stack probing. */
4719 #define PROBE_STACK_FIRST_REG R9_REGNUM
4720 #define PROBE_STACK_SECOND_REG R10_REGNUM
4721
4722 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4723 inclusive. These are offsets from the current stack pointer. */
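/* A sketch of the simplest case below (SIZE <= PROBE_INTERVAL), with
   register numbers purely illustrative:

	sub	x9, sp, #(first + base)
	str	xzr, [x9, #(base - size)]

   where BASE is SIZE rounded up to a multiple of 4096 so that the
   subtraction can use a shifted 12-bit immediate.  */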
4724
4725 static void
4726 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4727 {
4728 HOST_WIDE_INT size;
4729 if (!poly_size.is_constant (&size))
4730 {
4731 sorry ("stack probes for SVE frames");
4732 return;
4733 }
4734
4735 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4736
4737 /* See the same assertion on PROBE_INTERVAL above. */
4738 gcc_assert ((first % ARITH_FACTOR) == 0);
4739
4740 /* See if we have a constant small number of probes to generate. If so,
4741 that's the easy case. */
4742 if (size <= PROBE_INTERVAL)
4743 {
4744 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4745
4746 emit_set_insn (reg1,
4747 plus_constant (Pmode,
4748 stack_pointer_rtx, -(first + base)));
4749 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4750 }
4751
4752 /* The run-time loop is made up of 8 insns in the generic case while the
4753 compile-time sequence is made up of 4+2*(n-2) insns for n intervals. */
4754 else if (size <= 4 * PROBE_INTERVAL)
4755 {
4756 HOST_WIDE_INT i, rem;
4757
4758 emit_set_insn (reg1,
4759 plus_constant (Pmode,
4760 stack_pointer_rtx,
4761 -(first + PROBE_INTERVAL)));
4762 emit_stack_probe (reg1);
4763
4764 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4765 it exceeds SIZE. If only two probes are needed, this will not
4766 generate any code. Then probe at FIRST + SIZE. */
4767 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4768 {
4769 emit_set_insn (reg1,
4770 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4771 emit_stack_probe (reg1);
4772 }
4773
4774 rem = size - (i - PROBE_INTERVAL);
4775 if (rem > 256)
4776 {
4777 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4778
4779 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4780 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4781 }
4782 else
4783 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4784 }
4785
4786 /* Otherwise, do the same as above, but in a loop. Note that we must be
4787 extra careful with variables wrapping around because we might be at
4788 the very top (or the very bottom) of the address space and we have
4789 to be able to handle this case properly; in particular, we use an
4790 equality test for the loop condition. */
4791 else
4792 {
4793 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4794
4795 /* Step 1: round SIZE to the previous multiple of the interval. */
4796
4797 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4798
4799
4800 /* Step 2: compute initial and final value of the loop counter. */
4801
4802 /* TEST_ADDR = SP + FIRST. */
4803 emit_set_insn (reg1,
4804 plus_constant (Pmode, stack_pointer_rtx, -first));
4805
4806 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4807 HOST_WIDE_INT adjustment = - (first + rounded_size);
4808 if (! aarch64_uimm12_shift (adjustment))
4809 {
4810 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4811 true, Pmode);
4812 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4813 }
4814 else
4815 emit_set_insn (reg2,
4816 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4817
4818 /* Step 3: the loop
4819
4820 do
4821 {
4822 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4823 probe at TEST_ADDR
4824 }
4825 while (TEST_ADDR != LAST_ADDR)
4826
4827 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4828 until it is equal to ROUNDED_SIZE. */
4829
4830 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4831
4832
4833 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4834 that SIZE is equal to ROUNDED_SIZE. */
4835
4836 if (size != rounded_size)
4837 {
4838 HOST_WIDE_INT rem = size - rounded_size;
4839
4840 if (rem > 256)
4841 {
4842 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4843
4844 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4845 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4846 }
4847 else
4848 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4849 }
4850 }
4851
4852 /* Make sure nothing is scheduled before we are done. */
4853 emit_insn (gen_blockage ());
4854 }
4855
4856 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4857 absolute addresses. */
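/* A sketch of the loop printed by the code below (label and register
   numbers illustrative):

	.LPSRL0:
	sub	x9, x9, #interval
	str	xzr, [x9, #offset]
	cmp	x9, x10
	b.ne	.LPSRL0

   where OFFSET is STACK_CLASH_CALLER_GUARD when stack clash protection
   is enabled and 0 otherwise.  */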
4858
4859 const char *
4860 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4861 {
4862 static int labelno = 0;
4863 char loop_lab[32];
4864 rtx xops[2];
4865
4866 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4867
4868 /* Loop. */
4869 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4870
4871 HOST_WIDE_INT stack_clash_probe_interval
4872 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4873
4874 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4875 xops[0] = reg1;
4876 HOST_WIDE_INT interval;
4877 if (flag_stack_clash_protection)
4878 interval = stack_clash_probe_interval;
4879 else
4880 interval = PROBE_INTERVAL;
4881
4882 gcc_assert (aarch64_uimm12_shift (interval));
4883 xops[1] = GEN_INT (interval);
4884
4885 output_asm_insn ("sub\t%0, %0, %1", xops);
4886
4887 /* If doing stack clash protection then we probe up by the ABI specified
4888 amount. We do this because we're dropping full pages at a time in the
4889 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4890 if (flag_stack_clash_protection)
4891 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4892 else
4893 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4894
4895 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4896 by this amount for each iteration. */
4897 output_asm_insn ("str\txzr, [%0, %1]", xops);
4898
4899 /* Test if TEST_ADDR == LAST_ADDR. */
4900 xops[1] = reg2;
4901 output_asm_insn ("cmp\t%0, %1", xops);
4902
4903 /* Branch. */
4904 fputs ("\tb.ne\t", asm_out_file);
4905 assemble_name_raw (asm_out_file, loop_lab);
4906 fputc ('\n', asm_out_file);
4907
4908 return "";
4909 }
4910
4911 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4912 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4913 of GUARD_SIZE. When a probe is emitted it is done at most
4914 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4915 at most MIN_PROBE_THRESHOLD. By the end of this function
4916 BASE = BASE - ADJUSTMENT. */
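/* A sketch of the code printed by the function below (labels illustrative):

	.SVLPSPL0:
	cmp	adjustment, #residual_probe_guard
	b.lt	.SVLPEND0
	sub	base, base, #residual_probe_guard
	str	xzr, [base, #0]
	sub	adjustment, adjustment, #residual_probe_guard
	b	.SVLPSPL0
	.SVLPEND0:
	sub	base, base, adjustment  */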
4917
4918 const char *
4919 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4920 rtx min_probe_threshold, rtx guard_size)
4921 {
4922 /* This function is not allowed to use any instruction generation function
4923 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4924 so instead emit the code you want using output_asm_insn. */
4925 gcc_assert (flag_stack_clash_protection);
4926 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4927 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4928
4929 /* The minimum required allocation before the residual requires probing. */
4930 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4931
4932 /* Clamp the value down to the nearest value that can be used with a cmp. */
4933 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4934 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4935
4936 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4937 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4938
4939 static int labelno = 0;
4940 char loop_start_lab[32];
4941 char loop_end_lab[32];
4942 rtx xops[2];
4943
4944 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4945 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4946
4947 /* Emit loop start label. */
4948 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4949
4950 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4951 xops[0] = adjustment;
4952 xops[1] = probe_offset_value_rtx;
4953 output_asm_insn ("cmp\t%0, %1", xops);
4954
4955 /* Branch to end if not enough adjustment to probe. */
4956 fputs ("\tb.lt\t", asm_out_file);
4957 assemble_name_raw (asm_out_file, loop_end_lab);
4958 fputc ('\n', asm_out_file);
4959
4960 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4961 xops[0] = base;
4962 xops[1] = probe_offset_value_rtx;
4963 output_asm_insn ("sub\t%0, %0, %1", xops);
4964
4965 /* Probe at BASE. */
4966 xops[1] = const0_rtx;
4967 output_asm_insn ("str\txzr, [%0, %1]", xops);
4968
4969 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4970 xops[0] = adjustment;
4971 xops[1] = probe_offset_value_rtx;
4972 output_asm_insn ("sub\t%0, %0, %1", xops);
4973
4974 /* Branch to start if still more bytes to allocate. */
4975 fputs ("\tb\t", asm_out_file);
4976 assemble_name_raw (asm_out_file, loop_start_lab);
4977 fputc ('\n', asm_out_file);
4978
4979 /* No more probes needed: leave the loop. */
4980 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4981
4982 /* BASE = BASE - ADJUSTMENT. */
4983 xops[0] = base;
4984 xops[1] = adjustment;
4985 output_asm_insn ("sub\t%0, %0, %1", xops);
4986 return "";
4987 }
4988
4989 /* Determine whether a frame chain needs to be generated. */
4990 static bool
4991 aarch64_needs_frame_chain (void)
4992 {
4993 /* Force a frame chain for EH returns so the return address is at FP+8. */
4994 if (frame_pointer_needed || crtl->calls_eh_return)
4995 return true;
4996
4997 /* A leaf function cannot have calls or write LR. */
4998 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4999
5000 /* Don't use a frame chain in leaf functions if leaf frame pointers
5001 are disabled. */
5002 if (flag_omit_leaf_frame_pointer && is_leaf)
5003 return false;
5004
5005 return aarch64_use_frame_pointer;
5006 }
5007
5008 /* Mark the registers that need to be saved by the callee and calculate
5009 the size of the callee-saved registers area and frame record (both FP
5010 and LR may be omitted). */
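/* The code below ends by picking one of four allocation strategies,
   summarized here (see the inline comments for the exact sequences):

   1. Small frame, no outgoing arguments: allocate everything with the
      write-back of the first register push (callee_adjust only).
   2. Small outgoing argument area: a single SP subtraction, with the
      saves addressed from SP (initial_adjust + callee_offset).
   3. Large outgoing arguments but a small local area: push the saves
      with write-back, then drop SP for the outgoing arguments
      (callee_adjust + final_adjust).
   4. Otherwise: two SP subtractions around the saves, using the frame
      pointer (initial_adjust + final_adjust).  */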
5011 static void
5012 aarch64_layout_frame (void)
5013 {
5014 HOST_WIDE_INT offset = 0;
5015 int regno, last_fp_reg = INVALID_REGNUM;
5016 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5017
5018 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5019
5020 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5021 the mid-end is doing. */
5022 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5023
5024 #define SLOT_NOT_REQUIRED (-2)
5025 #define SLOT_REQUIRED (-1)
5026
5027 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5028 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5029
5030 /* If this is a non-leaf simd function with calls we assume that
5031 at least one of those calls is to a non-simd function and thus
5032 we must save V8 to V23 in the prologue. */
5033
5034 if (simd_function && !crtl->is_leaf)
5035 {
5036 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5037 if (FP_SIMD_SAVED_REGNUM_P (regno))
5038 df_set_regs_ever_live (regno, true);
5039 }
5040
5041 /* First mark all the registers that really need to be saved... */
5042 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5043 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5044
5045 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5046 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5047
5048 /* ... that includes the eh data registers (if needed)... */
5049 if (crtl->calls_eh_return)
5050 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5051 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5052 = SLOT_REQUIRED;
5053
5054 /* ... and any callee saved register that dataflow says is live. */
5055 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5056 if (df_regs_ever_live_p (regno)
5057 && (regno == R30_REGNUM
5058 || !call_used_regs[regno]))
5059 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5060
5061 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5062 if (df_regs_ever_live_p (regno)
5063 && (!call_used_regs[regno]
5064 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5065 {
5066 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5067 last_fp_reg = regno;
5068 }
5069
5070 if (cfun->machine->frame.emit_frame_chain)
5071 {
5072 /* FP and LR are placed in the linkage record. */
5073 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5074 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5075 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5076 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5077 offset = 2 * UNITS_PER_WORD;
5078 }
5079
5080 /* With stack-clash, LR must be saved in non-leaf functions. */
5081 gcc_assert (crtl->is_leaf
5082 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5083 != SLOT_NOT_REQUIRED));
5084
5085 /* Now assign stack slots for them. */
5086 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5087 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5088 {
5089 cfun->machine->frame.reg_offset[regno] = offset;
5090 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5091 cfun->machine->frame.wb_candidate1 = regno;
5092 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5093 cfun->machine->frame.wb_candidate2 = regno;
5094 offset += UNITS_PER_WORD;
5095 }
5096
5097 HOST_WIDE_INT max_int_offset = offset;
5098 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5099 bool has_align_gap = offset != max_int_offset;
5100
5101 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5102 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5103 {
5104 /* If there is an alignment gap between integer and fp callee-saves,
5105 allocate the last fp register to it if possible. */
5106 if (regno == last_fp_reg
5107 && has_align_gap
5108 && !simd_function
5109 && (offset & 8) == 0)
5110 {
5111 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5112 break;
5113 }
5114
5115 cfun->machine->frame.reg_offset[regno] = offset;
5116 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5117 cfun->machine->frame.wb_candidate1 = regno;
5118 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5119 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5120 cfun->machine->frame.wb_candidate2 = regno;
5121 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5122 }
5123
5124 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5125
5126 cfun->machine->frame.saved_regs_size = offset;
5127
5128 HOST_WIDE_INT varargs_and_saved_regs_size
5129 = offset + cfun->machine->frame.saved_varargs_size;
5130
5131 cfun->machine->frame.hard_fp_offset
5132 = aligned_upper_bound (varargs_and_saved_regs_size
5133 + get_frame_size (),
5134 STACK_BOUNDARY / BITS_PER_UNIT);
5135
5136 /* Both these values are already aligned. */
5137 gcc_assert (multiple_p (crtl->outgoing_args_size,
5138 STACK_BOUNDARY / BITS_PER_UNIT));
5139 cfun->machine->frame.frame_size
5140 = (cfun->machine->frame.hard_fp_offset
5141 + crtl->outgoing_args_size);
5142
5143 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5144
5145 cfun->machine->frame.initial_adjust = 0;
5146 cfun->machine->frame.final_adjust = 0;
5147 cfun->machine->frame.callee_adjust = 0;
5148 cfun->machine->frame.callee_offset = 0;
5149
5150 HOST_WIDE_INT max_push_offset = 0;
5151 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5152 max_push_offset = 512;
5153 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5154 max_push_offset = 256;
5155
5156 HOST_WIDE_INT const_size, const_fp_offset;
5157 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5158 && const_size < max_push_offset
5159 && known_eq (crtl->outgoing_args_size, 0))
5160 {
5161 /* Simple, small frame with no outgoing arguments:
5162 stp reg1, reg2, [sp, -frame_size]!
5163 stp reg3, reg4, [sp, 16] */
5164 cfun->machine->frame.callee_adjust = const_size;
5165 }
5166 else if (known_lt (crtl->outgoing_args_size
5167 + cfun->machine->frame.saved_regs_size, 512)
5168 && !(cfun->calls_alloca
5169 && known_lt (cfun->machine->frame.hard_fp_offset,
5170 max_push_offset)))
5171 {
5172 /* Frame with small outgoing arguments:
5173 sub sp, sp, frame_size
5174 stp reg1, reg2, [sp, outgoing_args_size]
5175 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5176 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5177 cfun->machine->frame.callee_offset
5178 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5179 }
5180 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5181 && const_fp_offset < max_push_offset)
5182 {
5183 /* Frame with large outgoing arguments but a small local area:
5184 stp reg1, reg2, [sp, -hard_fp_offset]!
5185 stp reg3, reg4, [sp, 16]
5186 sub sp, sp, outgoing_args_size */
5187 cfun->machine->frame.callee_adjust = const_fp_offset;
5188 cfun->machine->frame.final_adjust
5189 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5190 }
5191 else
5192 {
5193 /* Frame with large local area and outgoing arguments using frame pointer:
5194 sub sp, sp, hard_fp_offset
5195 stp x29, x30, [sp, 0]
5196 add x29, sp, 0
5197 stp reg3, reg4, [sp, 16]
5198 sub sp, sp, outgoing_args_size */
5199 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5200 cfun->machine->frame.final_adjust
5201 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5202 }
5203
5204 cfun->machine->frame.laid_out = true;
5205 }
5206
5207 /* Return true if the register REGNO is saved on entry to
5208 the current function. */
5209
5210 static bool
5211 aarch64_register_saved_on_entry (int regno)
5212 {
5213 return cfun->machine->frame.reg_offset[regno] >= 0;
5214 }
5215
5216 /* Return the next register, from REGNO up to LIMIT, that the callee
5217 needs to save. */
5218
5219 static unsigned
5220 aarch64_next_callee_save (unsigned regno, unsigned limit)
5221 {
5222 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5223 regno ++;
5224 return regno;
5225 }
5226
5227 /* Push the register number REGNO of mode MODE to the stack with write-back
5228 adjusting the stack by ADJUSTMENT. */
5229
5230 static void
5231 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5232 HOST_WIDE_INT adjustment)
5233 {
5234 rtx base_rtx = stack_pointer_rtx;
5235 rtx insn, reg, mem;
5236
5237 reg = gen_rtx_REG (mode, regno);
5238 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5239 plus_constant (Pmode, base_rtx, -adjustment));
5240 mem = gen_frame_mem (mode, mem);
5241
5242 insn = emit_move_insn (mem, reg);
5243 RTX_FRAME_RELATED_P (insn) = 1;
5244 }
5245
5246 /* Generate and return an instruction to store the pair of registers
5247 REG and REG2 of mode MODE to location BASE with write-back adjusting
5248 the stack location BASE by ADJUSTMENT. */
5249
5250 static rtx
5251 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5252 HOST_WIDE_INT adjustment)
5253 {
5254 switch (mode)
5255 {
5256 case E_DImode:
5257 return gen_storewb_pairdi_di (base, base, reg, reg2,
5258 GEN_INT (-adjustment),
5259 GEN_INT (UNITS_PER_WORD - adjustment));
5260 case E_DFmode:
5261 return gen_storewb_pairdf_di (base, base, reg, reg2,
5262 GEN_INT (-adjustment),
5263 GEN_INT (UNITS_PER_WORD - adjustment));
5264 case E_TFmode:
5265 return gen_storewb_pairtf_di (base, base, reg, reg2,
5266 GEN_INT (-adjustment),
5267 GEN_INT (UNITS_PER_VREG - adjustment));
5268 default:
5269 gcc_unreachable ();
5270 }
5271 }
5272
5273 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5274 stack pointer by ADJUSTMENT. */
5275
5276 static void
5277 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5278 {
5279 rtx_insn *insn;
5280 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5281
5282 if (regno2 == INVALID_REGNUM)
5283 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5284
5285 rtx reg1 = gen_rtx_REG (mode, regno1);
5286 rtx reg2 = gen_rtx_REG (mode, regno2);
5287
5288 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5289 reg2, adjustment));
5290 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5291 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5292 RTX_FRAME_RELATED_P (insn) = 1;
5293 }
5294
5295 /* Generate and return an instruction to load the pair of registers REG and
5296 REG2 of mode MODE from BASE, adjusting BASE by ADJUSTMENT afterwards. */
5297
5298 static rtx
5299 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5300 HOST_WIDE_INT adjustment)
5301 {
5302 switch (mode)
5303 {
5304 case E_DImode:
5305 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5306 GEN_INT (UNITS_PER_WORD));
5307 case E_DFmode:
5308 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5309 GEN_INT (UNITS_PER_WORD));
5310 case E_TFmode:
5311 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5312 GEN_INT (UNITS_PER_VREG));
5313 default:
5314 gcc_unreachable ();
5315 }
5316 }
5317
5318 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5319 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5320 into CFI_OPS. */
5321
5322 static void
5323 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5324 rtx *cfi_ops)
5325 {
5326 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5327 rtx reg1 = gen_rtx_REG (mode, regno1);
5328
5329 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5330
5331 if (regno2 == INVALID_REGNUM)
5332 {
5333 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5334 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5335 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5336 }
5337 else
5338 {
5339 rtx reg2 = gen_rtx_REG (mode, regno2);
5340 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5341 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5342 reg2, adjustment));
5343 }
5344 }
5345
5346 /* Generate and return a store pair instruction of mode MODE to store
5347 register REG1 to MEM1 and register REG2 to MEM2. */
5348
5349 static rtx
5350 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5351 rtx reg2)
5352 {
5353 switch (mode)
5354 {
5355 case E_DImode:
5356 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5357
5358 case E_DFmode:
5359 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5360
5361 case E_TFmode:
5362 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5363
5364 default:
5365 gcc_unreachable ();
5366 }
5367 }
5368
5369 /* Generate and return a load pair instruction of mode MODE to load register
5370 REG1 from MEM1 and register REG2 from MEM2. */
5371
5372 static rtx
5373 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5374 rtx mem2)
5375 {
5376 switch (mode)
5377 {
5378 case E_DImode:
5379 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5380
5381 case E_DFmode:
5382 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5383
5384 case E_TFmode:
5385 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5386
5387 default:
5388 gcc_unreachable ();
5389 }
5390 }
5391
5392 /* Return TRUE if return address signing should be enabled for the current
5393 function, otherwise return FALSE. */
5394
5395 bool
5396 aarch64_return_address_signing_enabled (void)
5397 {
5398 /* This function should only be called after the frame is laid out. */
5399 gcc_assert (cfun->machine->frame.laid_out);
5400
5401 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
5402 function if its LR is pushed onto the stack. */
5403 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5404 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5405 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5406 }
5407
5408 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5409 bool
5410 aarch64_bti_enabled (void)
5411 {
5412 return (aarch64_enable_bti == 1);
5413 }
5414
5415 /* Emit code to save the callee-saved registers from register number START
5416 to LIMIT to the stack at the location starting at offset START_OFFSET,
5417 skipping any write-back candidates if SKIP_WB is true. */
5418
5419 static void
5420 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5421 unsigned start, unsigned limit, bool skip_wb)
5422 {
5423 rtx_insn *insn;
5424 unsigned regno;
5425 unsigned regno2;
5426
5427 for (regno = aarch64_next_callee_save (start, limit);
5428 regno <= limit;
5429 regno = aarch64_next_callee_save (regno + 1, limit))
5430 {
5431 rtx reg, mem;
5432 poly_int64 offset;
5433 int offset_diff;
5434
5435 if (skip_wb
5436 && (regno == cfun->machine->frame.wb_candidate1
5437 || regno == cfun->machine->frame.wb_candidate2))
5438 continue;
5439
5440 if (cfun->machine->reg_is_wrapped_separately[regno])
5441 continue;
5442
5443 reg = gen_rtx_REG (mode, regno);
5444 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5445 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5446 offset));
5447
5448 regno2 = aarch64_next_callee_save (regno + 1, limit);
5449 offset_diff = cfun->machine->frame.reg_offset[regno2]
5450 - cfun->machine->frame.reg_offset[regno];
5451
5452 if (regno2 <= limit
5453 && !cfun->machine->reg_is_wrapped_separately[regno2]
5454 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5455 {
5456 rtx reg2 = gen_rtx_REG (mode, regno2);
5457 rtx mem2;
5458
5459 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5460 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5461 offset));
5462 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5463 reg2));
5464
5465 /* The first part of a frame-related parallel insn is
5466 always assumed to be relevant to the frame
5467 calculations; subsequent parts are only
5468 frame-related if explicitly marked. */
5469 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5470 regno = regno2;
5471 }
5472 else
5473 insn = emit_move_insn (mem, reg);
5474
5475 RTX_FRAME_RELATED_P (insn) = 1;
5476 }
5477 }
5478
5479 /* Emit code to restore the callee registers of mode MODE from register
5480 number START up to and including LIMIT. Restore from the stack offset
5481 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5482 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5483
5484 static void
5485 aarch64_restore_callee_saves (machine_mode mode,
5486 poly_int64 start_offset, unsigned start,
5487 unsigned limit, bool skip_wb, rtx *cfi_ops)
5488 {
5489 rtx base_rtx = stack_pointer_rtx;
5490 unsigned regno;
5491 unsigned regno2;
5492 poly_int64 offset;
5493
5494 for (regno = aarch64_next_callee_save (start, limit);
5495 regno <= limit;
5496 regno = aarch64_next_callee_save (regno + 1, limit))
5497 {
5498 if (cfun->machine->reg_is_wrapped_separately[regno])
5499 continue;
5500
5501 rtx reg, mem;
5502 int offset_diff;
5503
5504 if (skip_wb
5505 && (regno == cfun->machine->frame.wb_candidate1
5506 || regno == cfun->machine->frame.wb_candidate2))
5507 continue;
5508
5509 reg = gen_rtx_REG (mode, regno);
5510 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5511 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5512
5513 regno2 = aarch64_next_callee_save (regno + 1, limit);
5514 offset_diff = cfun->machine->frame.reg_offset[regno2]
5515 - cfun->machine->frame.reg_offset[regno];
5516
5517 if (regno2 <= limit
5518 && !cfun->machine->reg_is_wrapped_separately[regno2]
5519 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5520 {
5521 rtx reg2 = gen_rtx_REG (mode, regno2);
5522 rtx mem2;
5523
5524 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5525 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5526 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5527
5528 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5529 regno = regno2;
5530 }
5531 else
5532 emit_move_insn (reg, mem);
5533 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5534 }
5535 }
5536
5537 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5538 of MODE. */
5539
5540 static inline bool
5541 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5542 {
5543 HOST_WIDE_INT multiple;
5544 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5545 && IN_RANGE (multiple, -8, 7));
5546 }
5547
5548 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5549 of MODE. */
5550
5551 static inline bool
5552 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5553 {
5554 HOST_WIDE_INT multiple;
5555 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5556 && IN_RANGE (multiple, 0, 63));
5557 }
5558
5559 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5560 of MODE. */
5561
5562 bool
5563 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5564 {
5565 HOST_WIDE_INT multiple;
5566 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5567 && IN_RANGE (multiple, -64, 63));
5568 }
5569
5570 /* Return true if OFFSET is a signed 9-bit value. */
5571
5572 bool
5573 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5574 poly_int64 offset)
5575 {
5576 HOST_WIDE_INT const_offset;
5577 return (offset.is_constant (&const_offset)
5578 && IN_RANGE (const_offset, -256, 255));
5579 }
5580
5581 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5582 of MODE. */
5583
5584 static inline bool
5585 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5586 {
5587 HOST_WIDE_INT multiple;
5588 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5589 && IN_RANGE (multiple, -256, 255));
5590 }
5591
5592 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5593 of MODE. */
5594
5595 static inline bool
5596 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5597 {
5598 HOST_WIDE_INT multiple;
5599 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5600 && IN_RANGE (multiple, 0, 4095));
5601 }
5602
5603 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5604
5605 static sbitmap
5606 aarch64_get_separate_components (void)
5607 {
5608 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5609 bitmap_clear (components);
5610
5611 /* The registers we need saved to the frame. */
5612 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5613 if (aarch64_register_saved_on_entry (regno))
5614 {
5615 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5616 if (!frame_pointer_needed)
5617 offset += cfun->machine->frame.frame_size
5618 - cfun->machine->frame.hard_fp_offset;
5619 /* Check that we can access the stack slot of the register with one
5620 direct load with no adjustments needed. */
5621 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5622 bitmap_set_bit (components, regno);
5623 }
5624
5625 /* Don't mess with the hard frame pointer. */
5626 if (frame_pointer_needed)
5627 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5628
5629 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5630 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5631 /* If registers have been chosen to be stored/restored with
5632 writeback, don't interfere with them, to avoid having to output explicit
5633 stack adjustment instructions. */
5634 if (reg2 != INVALID_REGNUM)
5635 bitmap_clear_bit (components, reg2);
5636 if (reg1 != INVALID_REGNUM)
5637 bitmap_clear_bit (components, reg1);
5638
5639 bitmap_clear_bit (components, LR_REGNUM);
5640 bitmap_clear_bit (components, SP_REGNUM);
5641
5642 return components;
5643 }
5644
5645 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5646
5647 static sbitmap
5648 aarch64_components_for_bb (basic_block bb)
5649 {
5650 bitmap in = DF_LIVE_IN (bb);
5651 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5652 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5653 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5654
5655 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5656 bitmap_clear (components);
5657
5658 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5659 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5660 if ((!call_used_regs[regno]
5661 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5662 && (bitmap_bit_p (in, regno)
5663 || bitmap_bit_p (gen, regno)
5664 || bitmap_bit_p (kill, regno)))
5665 {
5666 unsigned regno2, offset, offset2;
5667 bitmap_set_bit (components, regno);
5668
5669 /* If there is a callee-save at an adjacent offset, add it as well,
5670 to increase the use of LDP/STP. */
5671 offset = cfun->machine->frame.reg_offset[regno];
5672 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5673
5674 if (regno2 <= LAST_SAVED_REGNUM)
5675 {
5676 offset2 = cfun->machine->frame.reg_offset[regno2];
5677 if ((offset & ~8) == (offset2 & ~8))
5678 bitmap_set_bit (components, regno2);
5679 }
5680 }
5681
5682 return components;
5683 }
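/* Illustrative note (an addition to this listing, not part of the GCC
   source): the pairing heuristic above works on 16-byte granules.  If,
   say, x22 is live in the block and its save slot is at offset 40
   (offset & 8 is nonzero), the candidate partner is x21; if x21's slot is
   at offset 32, the two slots share a granule ((offset & ~8) is 32 for
   both), so x21 is added to the component set too and the pair can later
   be handled with a single STP/LDP.  The register numbers and offsets
   here are hypothetical.  */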
5684
5685 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5686 Nothing to do for aarch64. */
5687
5688 static void
5689 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5690 {
5691 }
5692
5693 /* Return the next set bit in BMP from START onwards. Return the total number
5694 of bits in BMP if no set bit is found at or after START. */
5695
5696 static unsigned int
5697 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5698 {
5699 unsigned int nbits = SBITMAP_SIZE (bmp);
5700 if (start == nbits)
5701 return start;
5702
5703 gcc_assert (start < nbits);
5704 for (unsigned int i = start; i < nbits; i++)
5705 if (bitmap_bit_p (bmp, i))
5706 return i;
5707
5708 return nbits;
5709 }
5710
5711 /* Do the work for aarch64_emit_prologue_components and
5712 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5713 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5714 for these components or the epilogue sequence. That is, it determines
5715 whether we should emit stores or loads and what kind of CFA notes to attach
5716 to the insns. Otherwise the logic for the two sequences is very
5717 similar. */
5718
5719 static void
5720 aarch64_process_components (sbitmap components, bool prologue_p)
5721 {
5722 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5723 ? HARD_FRAME_POINTER_REGNUM
5724 : STACK_POINTER_REGNUM);
5725
5726 unsigned last_regno = SBITMAP_SIZE (components);
5727 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5728 rtx_insn *insn = NULL;
5729
5730 while (regno != last_regno)
5731 {
5732 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5733 so DFmode for the vector registers is enough. For simd functions
5734 we want to save the low 128 bits. */
5735 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5736
5737 rtx reg = gen_rtx_REG (mode, regno);
5738 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5739 if (!frame_pointer_needed)
5740 offset += cfun->machine->frame.frame_size
5741 - cfun->machine->frame.hard_fp_offset;
5742 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5743 rtx mem = gen_frame_mem (mode, addr);
5744
5745 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5746 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5747 /* No more registers to handle after REGNO.
5748 Emit a single save/restore and exit. */
5749 if (regno2 == last_regno)
5750 {
5751 insn = emit_insn (set);
5752 RTX_FRAME_RELATED_P (insn) = 1;
5753 if (prologue_p)
5754 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5755 else
5756 add_reg_note (insn, REG_CFA_RESTORE, reg);
5757 break;
5758 }
5759
5760 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5761 /* The next register is not of the same class or its offset is not
5762 mergeable with the current one into a pair. */
5763 if (!satisfies_constraint_Ump (mem)
5764 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5765 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5766 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5767 GET_MODE_SIZE (mode)))
5768 {
5769 insn = emit_insn (set);
5770 RTX_FRAME_RELATED_P (insn) = 1;
5771 if (prologue_p)
5772 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5773 else
5774 add_reg_note (insn, REG_CFA_RESTORE, reg);
5775
5776 regno = regno2;
5777 continue;
5778 }
5779
5780 /* REGNO2 can be saved/restored in a pair with REGNO. */
5781 rtx reg2 = gen_rtx_REG (mode, regno2);
5782 if (!frame_pointer_needed)
5783 offset2 += cfun->machine->frame.frame_size
5784 - cfun->machine->frame.hard_fp_offset;
5785 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5786 rtx mem2 = gen_frame_mem (mode, addr2);
5787 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5788 : gen_rtx_SET (reg2, mem2);
5789
5790 if (prologue_p)
5791 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5792 else
5793 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5794
5795 RTX_FRAME_RELATED_P (insn) = 1;
5796 if (prologue_p)
5797 {
5798 add_reg_note (insn, REG_CFA_OFFSET, set);
5799 add_reg_note (insn, REG_CFA_OFFSET, set2);
5800 }
5801 else
5802 {
5803 add_reg_note (insn, REG_CFA_RESTORE, reg);
5804 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5805 }
5806
5807 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5808 }
5809 }
5810
5811 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5812
5813 static void
5814 aarch64_emit_prologue_components (sbitmap components)
5815 {
5816 aarch64_process_components (components, true);
5817 }
5818
5819 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5820
5821 static void
5822 aarch64_emit_epilogue_components (sbitmap components)
5823 {
5824 aarch64_process_components (components, false);
5825 }
5826
5827 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5828
5829 static void
5830 aarch64_set_handled_components (sbitmap components)
5831 {
5832 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5833 if (bitmap_bit_p (components, regno))
5834 cfun->machine->reg_is_wrapped_separately[regno] = true;
5835 }
5836
5837 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5838 determine the probe offset for alloca. */
5839
5840 static HOST_WIDE_INT
5841 aarch64_stack_clash_protection_alloca_probe_range (void)
5842 {
5843 return STACK_CLASH_CALLER_GUARD;
5844 }
5845
5846
5847 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5848 registers. If POLY_SIZE is not large enough to require a probe this function
5849 will only adjust the stack. When allocating the stack space
5850 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5851 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5852 arguments. If we are then we ensure that any allocation larger than the ABI
5853 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5854 maintained.
5855
5856 We emit barriers after each stack adjustment to prevent optimizations from
5857 breaking the invariant that we never drop the stack more than a page. This
5858 invariant is needed to make it easier to correctly handle asynchronous
5859 events: if we allowed the stack to be dropped by more than a page before the
5860 corresponding probes were emitted and a signal arrived somewhere in between,
5861 the signal handler would not know the state of the stack and could make no
5862 assumptions about which pages have been probed. */
5863
5864 static void
5865 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5866 poly_int64 poly_size,
5867 bool frame_related_p,
5868 bool final_adjustment_p)
5869 {
5870 HOST_WIDE_INT guard_size
5871 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5872 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5873 /* When doing the final adjustment for the outgoing argument size we can't
5874 assume that LR was saved at position 0. So subtract its offset from the
5875 ABI safe buffer so that we don't accidentally allow an adjustment that
5876 would result in an allocation larger than the ABI buffer without
5877 probing. */
5878 HOST_WIDE_INT min_probe_threshold
5879 = final_adjustment_p
5880 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5881 : guard_size - guard_used_by_caller;
5882
5883 poly_int64 frame_size = cfun->machine->frame.frame_size;
5884
5885 /* We should always have a positive probe threshold. */
5886 gcc_assert (min_probe_threshold > 0);
5887
5888 if (flag_stack_clash_protection && !final_adjustment_p)
5889 {
5890 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5891 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5892
5893 if (known_eq (frame_size, 0))
5894 {
5895 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5896 }
5897 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5898 && known_lt (final_adjust, guard_used_by_caller))
5899 {
5900 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5901 }
5902 }
5903
5904 /* If SIZE is not large enough to require probing, just adjust the stack and
5905 exit. */
5906 if (known_lt (poly_size, min_probe_threshold)
5907 || !flag_stack_clash_protection)
5908 {
5909 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5910 return;
5911 }
5912
5913 HOST_WIDE_INT size;
5914 /* Handle the SVE non-constant case first. */
5915 if (!poly_size.is_constant (&size))
5916 {
5917 if (dump_file)
5918 {
5919 fprintf (dump_file, "Stack clash SVE prologue: ");
5920 print_dec (poly_size, dump_file);
5921 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5922 }
5923
5924 /* First calculate the amount of bytes we're actually spilling. */
5925 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5926 poly_size, temp1, temp2, false, true);
5927
5928 rtx_insn *insn = get_last_insn ();
5929
5930 if (frame_related_p)
5931 {
5932 /* This is done to provide unwinding information for the stack
5933 adjustments we're about to do, however to prevent the optimizers
5934 from removing the R11 move and leaving the CFA note (which would be
5935 very wrong) we tie the old and new stack pointer together.
5936 The tie will expand to nothing but the optimizers will not touch
5937 the instruction. */
5938 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5939 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5940 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5941
5942 /* We want the CFA independent of the stack pointer for the
5943 duration of the loop. */
5944 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5945 RTX_FRAME_RELATED_P (insn) = 1;
5946 }
5947
5948 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5949 rtx guard_const = gen_int_mode (guard_size, Pmode);
5950
5951 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5952 stack_pointer_rtx, temp1,
5953 probe_const, guard_const));
5954
5955 /* Now reset the CFA register if needed. */
5956 if (frame_related_p)
5957 {
5958 add_reg_note (insn, REG_CFA_DEF_CFA,
5959 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5960 gen_int_mode (poly_size, Pmode)));
5961 RTX_FRAME_RELATED_P (insn) = 1;
5962 }
5963
5964 return;
5965 }
5966
5967 if (dump_file)
5968 fprintf (dump_file,
5969 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5970 " bytes, probing will be required.\n", size);
5971
5972 /* Round size to the nearest multiple of guard_size, and calculate the
5973 residual as the difference between the original size and the rounded
5974 size. */
5975 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5976 HOST_WIDE_INT residual = size - rounded_size;
5977
5978 /* We can handle a small number of allocations/probes inline. Otherwise
5979 punt to a loop. */
5980 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5981 {
5982 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5983 {
5984 aarch64_sub_sp (NULL, temp2, guard_size, true);
5985 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5986 guard_used_by_caller));
5987 emit_insn (gen_blockage ());
5988 }
5989 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5990 }
5991 else
5992 {
5993 /* Compute the ending address. */
5994 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5995 temp1, NULL, false, true);
5996 rtx_insn *insn = get_last_insn ();
5997
5998 /* For the initial allocation, we don't have a frame pointer
5999 set up, so we always need CFI notes. If we're doing the
6000 final allocation, then we may have a frame pointer, in which
6001 case it is the CFA, otherwise we need CFI notes.
6002
6003 We can determine which allocation we are doing by looking at
6004 the value of FRAME_RELATED_P since the final allocations are not
6005 frame related. */
6006 if (frame_related_p)
6007 {
6008 /* We want the CFA independent of the stack pointer for the
6009 duration of the loop. */
6010 add_reg_note (insn, REG_CFA_DEF_CFA,
6011 plus_constant (Pmode, temp1, rounded_size));
6012 RTX_FRAME_RELATED_P (insn) = 1;
6013 }
6014
6015 /* This allocates and probes the stack. Note that this re-uses some of
6016 the existing Ada stack protection code. However we are guaranteed not
6017 to enter the non-loop or residual branches of that code.
6018
6019 The non-loop part won't be entered because if our allocation amount
6020 doesn't require a loop, the case above would handle it.
6021
6022 The residual amount won't be entered because TEMP1 is a multiple of
6023 the allocation size. The residual will always be 0. As such, the only
6024 part we are actually using from that code is the loop setup. The
6025 actual probing is done in aarch64_output_probe_stack_range. */
6026 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6027 stack_pointer_rtx, temp1));
6028
6029 /* Now reset the CFA register if needed. */
6030 if (frame_related_p)
6031 {
6032 add_reg_note (insn, REG_CFA_DEF_CFA,
6033 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6034 RTX_FRAME_RELATED_P (insn) = 1;
6035 }
6036
6037 emit_insn (gen_blockage ());
6038 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6039 }
6040
6041 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6042 be probed. This maintains the requirement that each page is probed at
6043 least once. For initial probing we probe only if the allocation is
6044 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6045 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6046 GUARD_SIZE. This means that any allocation large enough to trigger a probe
6047 here gets at least one, while any allocation small enough that this code
6048 emits nothing for it has already had its page probed by the saving of FP/LR,
6049 either in this function or in a callee. If we don't have any callees then
6050 we won't have more stack adjustments and so are still safe. */
6052 if (residual)
6053 {
6054 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6055 /* If we're doing final adjustments, and we've done any full page
6056 allocations then any residual needs to be probed. */
6057 if (final_adjustment_p && rounded_size != 0)
6058 min_probe_threshold = 0;
6059 /* If doing a small final adjustment, we always probe at offset 0.
6060 This is done to avoid issues when LR is not at position 0 or when
6061 the final adjustment is smaller than the probing offset. */
6062 else if (final_adjustment_p && rounded_size == 0)
6063 residual_probe_offset = 0;
6064
6065 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6066 if (residual >= min_probe_threshold)
6067 {
6068 if (dump_file)
6069 fprintf (dump_file,
6070 "Stack clash AArch64 prologue residuals: "
6071 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6072 "\n", residual);
6073
6074 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6075 residual_probe_offset));
6076 emit_insn (gen_blockage ());
6077 }
6078 }
6079 }
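/* Illustrative note (an addition to this listing, not part of the GCC
   source): with the default 64KiB guard and the 1KiB caller buffer, a
   constant initial adjustment of 147456 bytes (144KiB) exceeds
   min_probe_threshold (64KiB - 1KiB), so it is split into
   rounded_size == 131072, which is allocated and probed one guard-sized
   page at a time (inline or via the probe loop, depending on
   STACK_CLASH_MAX_UNROLL_PAGES), and a residual of 16384 bytes.  The
   residual is below the threshold, so it is only subtracted from SP here;
   the subsequent FP/LR saves act as the probe for that final page.  */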
6080
6081 /* Return 1 if the register is used by the epilogue. We need to say the
6082 return register is used, but only after epilogue generation is complete.
6083 Note that in the case of sibcalls, the values "used by the epilogue" are
6084 considered live at the start of the called function.
6085
6086 For SIMD functions we need to return 1 for FP registers that are saved and
6087 restored by a function but are not zero in call_used_regs. If we do not do
6088 this, optimizations may remove the restore of the register. */
6089
6090 int
6091 aarch64_epilogue_uses (int regno)
6092 {
6093 if (epilogue_completed)
6094 {
6095 if (regno == LR_REGNUM)
6096 return 1;
6097 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6098 return 1;
6099 }
6100 return 0;
6101 }
6102
6103 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6104 is saved at BASE + OFFSET. */
6105
6106 static void
6107 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6108 rtx base, poly_int64 offset)
6109 {
6110 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6111 add_reg_note (insn, REG_CFA_EXPRESSION,
6112 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6113 }
6114
6115 /* AArch64 stack frames generated by this compiler look like:
6116
6117 +-------------------------------+
6118 | |
6119 | incoming stack arguments |
6120 | |
6121 +-------------------------------+
6122 | | <-- incoming stack pointer (aligned)
6123 | callee-allocated save area |
6124 | for register varargs |
6125 | |
6126 +-------------------------------+
6127 | local variables | <-- frame_pointer_rtx
6128 | |
6129 +-------------------------------+
6130 | padding | \
6131 +-------------------------------+ |
6132 | callee-saved registers | | frame.saved_regs_size
6133 +-------------------------------+ |
6134 | LR' | |
6135 +-------------------------------+ |
6136 | FP' | / <- hard_frame_pointer_rtx (aligned)
6137 +-------------------------------+
6138 | dynamic allocation |
6139 +-------------------------------+
6140 | padding |
6141 +-------------------------------+
6142 | outgoing stack arguments | <-- arg_pointer
6143 | |
6144 +-------------------------------+
6145 | | <-- stack_pointer_rtx (aligned)
6146
6147 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6148 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6149 unchanged.
6150
6151 By default for stack-clash we assume the guard is at least 64KB, but this
6152 value is configurable to either 4KB or 64KB. We also force the guard size to
6153 be the same as the probing interval and both values are kept in sync.
6154
6155 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6156 on the guard size) of stack space without probing.
6157
6158 When probing is needed, we emit a probe at the start of the prologue
6159 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6160
6161 We have to track how much space has been allocated; the only stores
6162 to the stack that we track as implicit probes are the FP/LR stores.
6163
6164 For outgoing arguments we probe if the size is larger than 1KB, such that
6165 the ABI specified buffer is maintained for the next callee.
6166
6167 The following registers are reserved during frame layout and should not be
6168 used for any other purpose:
6169
6170 - r11: Used by stack clash protection when SVE is enabled.
6171 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6172 - r14 and r15: Used for speculation tracking.
6173 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6174 - r30(LR), r29(FP): Used by standard frame layout.
6175
6176 These registers must be avoided in frame layout related code unless the
6177 explicit intention is to interact with one of the features listed above. */
6178
6179 /* Generate the prologue instructions for entry into a function.
6180 Establish the stack frame by decreasing the stack pointer with a
6181 properly calculated size and, if necessary, create a frame record
6182 filled with the values of LR and previous frame pointer. The
6183 current FP is also set up if it is in use. */
6184
6185 void
6186 aarch64_expand_prologue (void)
6187 {
6188 poly_int64 frame_size = cfun->machine->frame.frame_size;
6189 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6190 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6191 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6192 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6193 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6194 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6195 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6196 rtx_insn *insn;
6197
6198 /* Sign return address for functions. */
6199 if (aarch64_return_address_signing_enabled ())
6200 {
6201 switch (aarch64_ra_sign_key)
6202 {
6203 case AARCH64_KEY_A:
6204 insn = emit_insn (gen_paciasp ());
6205 break;
6206 case AARCH64_KEY_B:
6207 insn = emit_insn (gen_pacibsp ());
6208 break;
6209 default:
6210 gcc_unreachable ();
6211 }
6212 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6213 RTX_FRAME_RELATED_P (insn) = 1;
6214 }
6215
6216 if (flag_stack_usage_info)
6217 current_function_static_stack_size = constant_lower_bound (frame_size);
6218
6219 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6220 {
6221 if (crtl->is_leaf && !cfun->calls_alloca)
6222 {
6223 if (maybe_gt (frame_size, PROBE_INTERVAL)
6224 && maybe_gt (frame_size, get_stack_check_protect ()))
6225 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6226 (frame_size
6227 - get_stack_check_protect ()));
6228 }
6229 else if (maybe_gt (frame_size, 0))
6230 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6231 }
6232
6233 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6234 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6235
6236 /* In theory we should never have both an initial adjustment
6237 and a callee save adjustment. Verify that is the case since the
6238 code below does not handle it for -fstack-clash-protection. */
6239 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6240
6241 /* Will only probe if the initial adjustment is larger than the guard
6242 less the amount of the guard reserved for use by the caller's
6243 outgoing args. */
6244 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6245 true, false);
6246
6247 if (callee_adjust != 0)
6248 aarch64_push_regs (reg1, reg2, callee_adjust);
6249
6250 if (emit_frame_chain)
6251 {
6252 poly_int64 reg_offset = callee_adjust;
6253 if (callee_adjust == 0)
6254 {
6255 reg1 = R29_REGNUM;
6256 reg2 = R30_REGNUM;
6257 reg_offset = callee_offset;
6258 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6259 }
6260 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6261 stack_pointer_rtx, callee_offset,
6262 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6263 if (frame_pointer_needed && !frame_size.is_constant ())
6264 {
6265 /* Variable-sized frames need to describe the save slot
6266 address using DW_CFA_expression rather than DW_CFA_offset.
6267 This means that, without taking further action, the
6268 locations of the registers that we've already saved would
6269 remain based on the stack pointer even after we redefine
6270 the CFA based on the frame pointer. We therefore need new
6271 DW_CFA_expressions to re-express the save slots with addresses
6272 based on the frame pointer. */
6273 rtx_insn *insn = get_last_insn ();
6274 gcc_assert (RTX_FRAME_RELATED_P (insn));
6275
6276 /* Add an explicit CFA definition if this was previously
6277 implicit. */
6278 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6279 {
6280 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6281 callee_offset);
6282 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6283 gen_rtx_SET (hard_frame_pointer_rtx, src));
6284 }
6285
6286 /* Change the save slot expressions for the registers that
6287 we've already saved. */
6288 reg_offset -= callee_offset;
6289 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6290 reg_offset + UNITS_PER_WORD);
6291 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6292 reg_offset);
6293 }
6294 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6295 }
6296
6297 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6298 callee_adjust != 0 || emit_frame_chain);
6299 if (aarch64_simd_decl_p (cfun->decl))
6300 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6301 callee_adjust != 0 || emit_frame_chain);
6302 else
6303 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6304 callee_adjust != 0 || emit_frame_chain);
6305
6306 /* We may need to probe the final adjustment if it is larger than the guard
6307 that is assumed by the callee. */
6308 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6309 !frame_pointer_needed, true);
6310 }
6311
6312 /* Return TRUE if we can use a simple_return insn.
6313
6314 This function checks whether the callee saved stack is empty, which
6315 means no restore actions are needed. The pro_and_epilogue pass will use
6316 this to check whether the shrink-wrapping optimization is feasible.
6317
6318 bool
6319 aarch64_use_return_insn_p (void)
6320 {
6321 if (!reload_completed)
6322 return false;
6323
6324 if (crtl->profile)
6325 return false;
6326
6327 return known_eq (cfun->machine->frame.frame_size, 0);
6328 }
6329
6330 /* Return false for non-leaf SIMD functions in order to avoid
6331 shrink-wrapping them, which would lose the necessary
6332 save/restore of FP registers. */
6333
6334 bool
6335 aarch64_use_simple_return_insn_p (void)
6336 {
6337 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6338 return false;
6339
6340 return true;
6341 }
6342
6343 /* Generate the epilogue instructions for returning from a function.
6344 This is almost exactly the reverse of the prologue sequence, except
6345 that we need to insert barriers to avoid scheduling loads that read
6346 from a deallocated stack, and we optimize the unwind records by
6347 emitting them all together if possible. */
6348 void
6349 aarch64_expand_epilogue (bool for_sibcall)
6350 {
6351 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6352 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6353 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6354 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6355 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6356 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6357 rtx cfi_ops = NULL;
6358 rtx_insn *insn;
6359 /* A stack clash protection prologue may not have left EP0_REGNUM or
6360 EP1_REGNUM in a usable state. The same is true for allocations
6361 with an SVE component, since we then need both temporary registers
6362 for each allocation. For stack clash we are in a usable state if
6363 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6364 HOST_WIDE_INT guard_size
6365 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6366 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6367
6368 /* We can re-use the registers when the allocation amount is smaller than
6369 guard_size - guard_used_by_caller because we won't be doing any probes
6370 then. In such situations the register should remain live with the correct
6371 value. */
6372 bool can_inherit_p = (initial_adjust.is_constant ()
6373 && final_adjust.is_constant ())
6374 && (!flag_stack_clash_protection
6375 || known_lt (initial_adjust,
6376 guard_size - guard_used_by_caller));
6377
6378 /* We need to add memory barrier to prevent read from deallocated stack. */
6379 bool need_barrier_p
6380 = maybe_ne (get_frame_size ()
6381 + cfun->machine->frame.saved_varargs_size, 0);
6382
6383 /* Emit a barrier to prevent loads from a deallocated stack. */
6384 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6385 || cfun->calls_alloca
6386 || crtl->calls_eh_return)
6387 {
6388 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6389 need_barrier_p = false;
6390 }
6391
6392 /* Restore the stack pointer from the frame pointer if it may not
6393 be the same as the stack pointer. */
6394 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6395 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6396 if (frame_pointer_needed
6397 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6398 /* If writeback is used when restoring callee-saves, the CFA
6399 is restored on the instruction doing the writeback. */
6400 aarch64_add_offset (Pmode, stack_pointer_rtx,
6401 hard_frame_pointer_rtx, -callee_offset,
6402 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6403 else
6404 /* The case where we need to re-use the register here is very rare, so
6405 avoid the complicated condition and just always emit a move if the
6406 immediate doesn't fit. */
6407 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6408
6409 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6410 callee_adjust != 0, &cfi_ops);
6411 if (aarch64_simd_decl_p (cfun->decl))
6412 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6413 callee_adjust != 0, &cfi_ops);
6414 else
6415 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6416 callee_adjust != 0, &cfi_ops);
6417
6418 if (need_barrier_p)
6419 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6420
6421 if (callee_adjust != 0)
6422 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6423
6424 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6425 {
6426 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6427 insn = get_last_insn ();
6428 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6429 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6430 RTX_FRAME_RELATED_P (insn) = 1;
6431 cfi_ops = NULL;
6432 }
6433
6434 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6435 restrict the emit_move optimization to leaf functions. */
6436 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6437 (!can_inherit_p || !crtl->is_leaf
6438 || df_regs_ever_live_p (EP0_REGNUM)));
6439
6440 if (cfi_ops)
6441 {
6442 /* Emit delayed restores and reset the CFA to be SP. */
6443 insn = get_last_insn ();
6444 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6445 REG_NOTES (insn) = cfi_ops;
6446 RTX_FRAME_RELATED_P (insn) = 1;
6447 }
6448
6449 /* We prefer to emit the combined return/authenticate instruction RETAA,
6450 however there are three cases in which we must instead emit an explicit
6451 authentication instruction.
6452
6453 1) Sibcalls don't return in a normal way, so if we're about to call one
6454 we must authenticate.
6455
6456 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6457 generating code for !TARGET_ARMV8_3 we can't use it and must
6458 explicitly authenticate.
6459
6460 3) On an eh_return path we make extra stack adjustments to update the
6461 canonical frame address to be the exception handler's CFA. We want
6462 to authenticate using the CFA of the function which calls eh_return.
6463 */
6464 if (aarch64_return_address_signing_enabled ()
6465 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6466 {
6467 switch (aarch64_ra_sign_key)
6468 {
6469 case AARCH64_KEY_A:
6470 insn = emit_insn (gen_autiasp ());
6471 break;
6472 case AARCH64_KEY_B:
6473 insn = emit_insn (gen_autibsp ());
6474 break;
6475 default:
6476 gcc_unreachable ();
6477 }
6478 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6479 RTX_FRAME_RELATED_P (insn) = 1;
6480 }
6481
6482 /* Stack adjustment for exception handler. */
6483 if (crtl->calls_eh_return && !for_sibcall)
6484 {
6485 /* We need to unwind the stack by the offset computed by
6486 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6487 to be SP; letting the CFA move during this adjustment
6488 is just as correct as retaining the CFA from the body
6489 of the function. Therefore, do nothing special. */
6490 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6491 }
6492
6493 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6494 if (!for_sibcall)
6495 emit_jump_insn (ret_rtx);
6496 }
6497
6498 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6499 normally or return to a previous frame after unwinding.
6500
6501 An EH return uses a single shared return sequence. The epilogue is
6502 exactly like a normal epilogue except that it has an extra input
6503 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6504 that must be applied after the frame has been destroyed. An extra label
6505 is inserted before the epilogue which initializes this register to zero,
6506 and this is the entry point for a normal return.
6507
6508 An actual EH return updates the return address, initializes the stack
6509 adjustment and jumps directly into the epilogue (bypassing the zeroing
6510 of the adjustment). Since the return address is typically saved on the
6511 stack when a function makes a call, the saved LR must be updated outside
6512 the epilogue.
6513
6514 This poses problems as the store is generated well before the epilogue,
6515 so the offset of LR is not known yet. Also optimizations will remove the
6516 store as it appears dead, even after the epilogue is generated (as the
6517 base or offset for loading LR is different in many cases).
6518
6519 To avoid these problems this implementation forces the frame pointer
6520 in eh_return functions so that the location of LR is fixed and known early.
6521 It also marks the store volatile, so no optimization is permitted to
6522 remove the store. */
6523 rtx
6524 aarch64_eh_return_handler_rtx (void)
6525 {
6526 rtx tmp = gen_frame_mem (Pmode,
6527 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6528
6529 /* Mark the store volatile, so no optimization is permitted to remove it. */
6530 MEM_VOLATILE_P (tmp) = true;
6531 return tmp;
6532 }
6533
6534 /* Output code to add DELTA to the first argument, and then jump
6535 to FUNCTION. Used for C++ multiple inheritance. */
6536 static void
6537 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6538 HOST_WIDE_INT delta,
6539 HOST_WIDE_INT vcall_offset,
6540 tree function)
6541 {
6542 /* The this pointer is always in x0. Note that this differs from
6543 Arm where the this pointer may be bumped to r1 if r0 is required
6544 to return a pointer to an aggregate. On AArch64 a result value
6545 pointer will be in x8. */
6546 int this_regno = R0_REGNUM;
6547 rtx this_rtx, temp0, temp1, addr, funexp;
6548 rtx_insn *insn;
6549 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6550
6551 if (aarch64_bti_enabled ())
6552 emit_insn (gen_bti_c ());
6553
6554 reload_completed = 1;
6555 emit_note (NOTE_INSN_PROLOGUE_END);
6556
6557 this_rtx = gen_rtx_REG (Pmode, this_regno);
6558 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6559 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6560
6561 if (vcall_offset == 0)
6562 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6563 else
6564 {
6565 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6566
6567 addr = this_rtx;
6568 if (delta != 0)
6569 {
6570 if (delta >= -256 && delta < 256)
6571 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6572 plus_constant (Pmode, this_rtx, delta));
6573 else
6574 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6575 temp1, temp0, false);
6576 }
6577
6578 if (Pmode == ptr_mode)
6579 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6580 else
6581 aarch64_emit_move (temp0,
6582 gen_rtx_ZERO_EXTEND (Pmode,
6583 gen_rtx_MEM (ptr_mode, addr)));
6584
6585 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6586 addr = plus_constant (Pmode, temp0, vcall_offset);
6587 else
6588 {
6589 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6590 Pmode);
6591 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6592 }
6593
6594 if (Pmode == ptr_mode)
6595 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6596 else
6597 aarch64_emit_move (temp1,
6598 gen_rtx_SIGN_EXTEND (Pmode,
6599 gen_rtx_MEM (ptr_mode, addr)));
6600
6601 emit_insn (gen_add2_insn (this_rtx, temp1));
6602 }
6603
6604 /* Generate a tail call to the target function. */
6605 if (!TREE_USED (function))
6606 {
6607 assemble_external (function);
6608 TREE_USED (function) = 1;
6609 }
6610 funexp = XEXP (DECL_RTL (function), 0);
6611 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6612 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6613 SIBLING_CALL_P (insn) = 1;
6614
6615 insn = get_insns ();
6616 shorten_branches (insn);
6617
6618 assemble_start_function (thunk, fnname);
6619 final_start_function (insn, file, 1);
6620 final (insn, file, 1);
6621 final_end_function ();
6622 assemble_end_function (thunk, fnname);
6623
6624 /* Stop pretending to be a post-reload pass. */
6625 reload_completed = 0;
6626 }
6627
6628 static bool
6629 aarch64_tls_referenced_p (rtx x)
6630 {
6631 if (!TARGET_HAVE_TLS)
6632 return false;
6633 subrtx_iterator::array_type array;
6634 FOR_EACH_SUBRTX (iter, array, x, ALL)
6635 {
6636 const_rtx x = *iter;
6637 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6638 return true;
6639 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6640 TLS offsets, not real symbol references. */
6641 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6642 iter.skip_subrtxes ();
6643 }
6644 return false;
6645 }
6646
6647
6648 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6649 a left shift of 0 or 12 bits. */
6650 bool
6651 aarch64_uimm12_shift (HOST_WIDE_INT val)
6652 {
6653 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6654 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6655 );
6656 }
6657
6658 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6659 that can be created with a left shift of 0 or 12. */
6660 static HOST_WIDE_INT
6661 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6662 {
6663 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6664 handle correctly. */
6665 gcc_assert ((val & 0xffffff) == val);
6666
6667 if (((val & 0xfff) << 0) == val)
6668 return val;
6669
6670 return val & (0xfff << 12);
6671 }
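/* Illustrative note (an addition to this listing, not part of the GCC
   source): aarch64_uimm12_shift accepts 0xabc (fits in the low 12 bits)
   and 0x123000 (a 12-bit value shifted left by 12), but rejects 0x1234,
   which needs both fields.  For such a value
   aarch64_clamp_to_uimm12_shift (0x1234) drops the low field and
   returns 0x1000.  */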
6672
6673 /* Return true if val is an immediate that can be loaded into a
6674 register by a MOVZ instruction. */
6675 static bool
6676 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6677 {
6678 if (GET_MODE_SIZE (mode) > 4)
6679 {
6680 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6681 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6682 return 1;
6683 }
6684 else
6685 {
6686 /* Ignore sign extension. */
6687 val &= (HOST_WIDE_INT) 0xffffffff;
6688 }
6689 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6690 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6691 }
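/* Illustrative note (an addition to this listing, not part of the GCC
   source): in DImode, 0xffff0000 is accepted (a single 16-bit chunk
   shifted left by 16, so a MOVZ can materialize it), as is
   0xabcd000000000000 (chunk shifted by 48), whereas 0x12345678 spans two
   16-bit chunks and is rejected.  */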
6692
6693 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6694 64-bit (DImode) integer. */
6695
6696 static unsigned HOST_WIDE_INT
6697 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6698 {
6699 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6700 while (size < 64)
6701 {
6702 val &= (HOST_WIDE_INT_1U << size) - 1;
6703 val |= val << size;
6704 size *= 2;
6705 }
6706 return val;
6707 }
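/* Illustrative note (an addition to this listing, not part of the GCC
   source): for a mode with 8-bit elements,
   aarch64_replicate_bitmask_imm (0xab, mode) doubles the pattern until
   it fills 64 bits, producing 0xabababababababab.  */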
6708
6709 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6710
6711 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6712 {
6713 0x0000000100000001ull,
6714 0x0001000100010001ull,
6715 0x0101010101010101ull,
6716 0x1111111111111111ull,
6717 0x5555555555555555ull,
6718 };
6719
6720
6721 /* Return true if val is a valid bitmask immediate. */
6722
6723 bool
6724 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6725 {
6726 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6727 int bits;
6728
6729 /* Check for a single sequence of one bits and return quickly if so.
6730 The special cases of all ones and all zeroes returns false. */
6731 val = aarch64_replicate_bitmask_imm (val_in, mode);
6732 tmp = val + (val & -val);
6733
6734 if (tmp == (tmp & -tmp))
6735 return (val + 1) > 1;
6736
6737 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6738 if (mode == SImode)
6739 val = (val << 32) | (val & 0xffffffff);
6740
6741 /* Invert if the immediate doesn't start with a zero bit - this means we
6742 only need to search for sequences of one bits. */
6743 if (val & 1)
6744 val = ~val;
6745
6746 /* Find the first set bit and set tmp to val with the first sequence of one
6747 bits removed. Return success if there is a single sequence of ones. */
6748 first_one = val & -val;
6749 tmp = val & (val + first_one);
6750
6751 if (tmp == 0)
6752 return true;
6753
6754 /* Find the next set bit and compute the difference in bit position. */
6755 next_one = tmp & -tmp;
6756 bits = clz_hwi (first_one) - clz_hwi (next_one);
6757 mask = val ^ tmp;
6758
6759 /* Check the bit position difference is a power of 2, and that the first
6760 sequence of one bits fits within 'bits' bits. */
6761 if ((mask >> bits) != 0 || bits != (bits & -bits))
6762 return false;
6763
6764 /* Check the sequence of one bits is repeated 64/bits times. */
6765 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6766 }
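/* Illustrative note (an addition to this listing, not part of the GCC
   source): 0xff00 passes the quick test above (a single contiguous run
   of ones), while 0x00ff00ff00ff00ff passes the full check: after
   inverting the value (it starts with a one bit) the code finds a run of
   eight ones repeating every 16 bits, so the inverted value equals
   mask * bitmask_imm_mul[1] with mask == 0xff00 and bits == 16.  A value
   such as 0x1234, whose set bits do not form a repeated contiguous run,
   is rejected.  */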
6767
6768 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6769 Assumed precondition: VAL_IN is not zero. */
6770
6771 unsigned HOST_WIDE_INT
6772 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6773 {
6774 int lowest_bit_set = ctz_hwi (val_in);
6775 int highest_bit_set = floor_log2 (val_in);
6776 gcc_assert (val_in != 0);
6777
6778 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6779 (HOST_WIDE_INT_1U << lowest_bit_set));
6780 }
6781
6782 /* Create a constant in which the bits outside the range from the lowest set
6783 bit to the highest set bit of VAL_IN are set to 1. */
6784
6785 unsigned HOST_WIDE_INT
6786 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6787 {
6788 return val_in | ~aarch64_and_split_imm1 (val_in);
6789 }
6790
6791 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6792
6793 bool
6794 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6795 {
6796 scalar_int_mode int_mode;
6797 if (!is_a <scalar_int_mode> (mode, &int_mode))
6798 return false;
6799
6800 if (aarch64_bitmask_imm (val_in, int_mode))
6801 return false;
6802
6803 if (aarch64_move_imm (val_in, int_mode))
6804 return false;
6805
6806 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6807
6808 return aarch64_bitmask_imm (imm2, int_mode);
6809 }
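/* Illustrative note (an addition to this listing, not part of the GCC
   source): VAL_IN == 0x00f0ff00 contains two runs of ones and is neither
   a bitmask immediate nor a MOV immediate, but aarch64_and_split_imm1
   gives 0x00ffff00 (ones covering the whole span) and
   aarch64_and_split_imm2 gives 0xfffffffffff0ffff.  Since
   imm1 & imm2 == VAL_IN and both values are bitmask immediates, an AND
   with VAL_IN can be implemented as two ANDs, so
   aarch64_and_bitmask_imm returns true for it.  */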
6810
6811 /* Return true if val is an immediate that can be loaded into a
6812 register in a single instruction. */
6813 bool
6814 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6815 {
6816 scalar_int_mode int_mode;
6817 if (!is_a <scalar_int_mode> (mode, &int_mode))
6818 return false;
6819
6820 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6821 return true;
6822 return aarch64_bitmask_imm (val, int_mode);
6823 }
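/* For example, in DImode 0xffff0000 can be loaded with a single MOVZ
   (16-bit immediate shifted left by 16) and 0xfffffffffffffffe with a
   single MOVN, so both return true; 0x0000000012345678 has two non-zero
   halfwords and is not a bitmask immediate, so it returns false.  */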
6824
6825 static bool
6826 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6827 {
6828 rtx base, offset;
6829
6830 if (GET_CODE (x) == HIGH)
6831 return true;
6832
6833 /* There's no way to calculate VL-based values using relocations. */
6834 subrtx_iterator::array_type array;
6835 FOR_EACH_SUBRTX (iter, array, x, ALL)
6836 if (GET_CODE (*iter) == CONST_POLY_INT)
6837 return true;
6838
6839 split_const (x, &base, &offset);
6840 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6841 {
6842 if (aarch64_classify_symbol (base, INTVAL (offset))
6843 != SYMBOL_FORCE_TO_MEM)
6844 return true;
6845 else
6846 /* Avoid generating a 64-bit relocation in ILP32; leave it
6847 to aarch64_expand_mov_immediate to handle properly. */
6848 return mode != ptr_mode;
6849 }
6850
6851 return aarch64_tls_referenced_p (x);
6852 }
6853
6854 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6855 The expansion for a table switch is quite expensive due to the number
6856 of instructions, the table lookup and the hard-to-predict indirect jump.
6857 When optimizing for speed at -O3 or higher, use the per-core tuning if
6858 set; otherwise use tables for more than 16 cases as a tradeoff between
6859 size and performance. When optimizing for size, use the default setting. */
6860
6861 static unsigned int
6862 aarch64_case_values_threshold (void)
6863 {
6864 /* Use the specified limit for the number of cases before using jump
6865 tables at higher optimization levels. */
6866 if (optimize > 2
6867 && selected_cpu->tune->max_case_values != 0)
6868 return selected_cpu->tune->max_case_values;
6869 else
6870 return optimize_size ? default_case_values_threshold () : 17;
6871 }
6872
6873 /* Return true if register REGNO is a valid index register.
6874 STRICT_P is true if REG_OK_STRICT is in effect. */
6875
6876 bool
6877 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6878 {
6879 if (!HARD_REGISTER_NUM_P (regno))
6880 {
6881 if (!strict_p)
6882 return true;
6883
6884 if (!reg_renumber)
6885 return false;
6886
6887 regno = reg_renumber[regno];
6888 }
6889 return GP_REGNUM_P (regno);
6890 }
6891
6892 /* Return true if register REGNO is a valid base register.
6893 STRICT_P is true if REG_OK_STRICT is in effect. */
6894
6895 bool
6896 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6897 {
6898 if (!HARD_REGISTER_NUM_P (regno))
6899 {
6900 if (!strict_p)
6901 return true;
6902
6903 if (!reg_renumber)
6904 return false;
6905
6906 regno = reg_renumber[regno];
6907 }
6908
6909 /* The fake registers will be eliminated to either the stack or
6910 hard frame pointer, both of which are usually valid base registers.
6911 Reload deals with the cases where the eliminated form isn't valid. */
6912 return (GP_REGNUM_P (regno)
6913 || regno == SP_REGNUM
6914 || regno == FRAME_POINTER_REGNUM
6915 || regno == ARG_POINTER_REGNUM);
6916 }
6917
6918 /* Return true if X is a valid base register.
6919 STRICT_P is true if REG_OK_STRICT is in effect. */
6920
6921 static bool
6922 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6923 {
6924 if (!strict_p
6925 && GET_CODE (x) == SUBREG
6926 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6927 x = SUBREG_REG (x);
6928
6929 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6930 }
6931
6932 /* Return true if the address offset X is a valid index for MODE. If it is,
6933 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6934
6935 static bool
6936 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6937 machine_mode mode, bool strict_p)
6938 {
6939 enum aarch64_address_type type;
6940 rtx index;
6941 int shift;
6942
6943 /* (reg:P) */
6944 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6945 && GET_MODE (x) == Pmode)
6946 {
6947 type = ADDRESS_REG_REG;
6948 index = x;
6949 shift = 0;
6950 }
6951 /* (sign_extend:DI (reg:SI)) */
6952 else if ((GET_CODE (x) == SIGN_EXTEND
6953 || GET_CODE (x) == ZERO_EXTEND)
6954 && GET_MODE (x) == DImode
6955 && GET_MODE (XEXP (x, 0)) == SImode)
6956 {
6957 type = (GET_CODE (x) == SIGN_EXTEND)
6958 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6959 index = XEXP (x, 0);
6960 shift = 0;
6961 }
6962 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6963 else if (GET_CODE (x) == MULT
6964 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6965 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6966 && GET_MODE (XEXP (x, 0)) == DImode
6967 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6968 && CONST_INT_P (XEXP (x, 1)))
6969 {
6970 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6971 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6972 index = XEXP (XEXP (x, 0), 0);
6973 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6974 }
6975 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6976 else if (GET_CODE (x) == ASHIFT
6977 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6978 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6979 && GET_MODE (XEXP (x, 0)) == DImode
6980 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6981 && CONST_INT_P (XEXP (x, 1)))
6982 {
6983 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6984 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6985 index = XEXP (XEXP (x, 0), 0);
6986 shift = INTVAL (XEXP (x, 1));
6987 }
6988 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6989 else if ((GET_CODE (x) == SIGN_EXTRACT
6990 || GET_CODE (x) == ZERO_EXTRACT)
6991 && GET_MODE (x) == DImode
6992 && GET_CODE (XEXP (x, 0)) == MULT
6993 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6994 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6995 {
6996 type = (GET_CODE (x) == SIGN_EXTRACT)
6997 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6998 index = XEXP (XEXP (x, 0), 0);
6999 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7000 if (INTVAL (XEXP (x, 1)) != 32 + shift
7001 || INTVAL (XEXP (x, 2)) != 0)
7002 shift = -1;
7003 }
7004 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7005 (const_int 0xffffffff<<shift)) */
7006 else if (GET_CODE (x) == AND
7007 && GET_MODE (x) == DImode
7008 && GET_CODE (XEXP (x, 0)) == MULT
7009 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7010 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7011 && CONST_INT_P (XEXP (x, 1)))
7012 {
7013 type = ADDRESS_REG_UXTW;
7014 index = XEXP (XEXP (x, 0), 0);
7015 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7016 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7017 shift = -1;
7018 }
7019 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7020 else if ((GET_CODE (x) == SIGN_EXTRACT
7021 || GET_CODE (x) == ZERO_EXTRACT)
7022 && GET_MODE (x) == DImode
7023 && GET_CODE (XEXP (x, 0)) == ASHIFT
7024 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7025 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7026 {
7027 type = (GET_CODE (x) == SIGN_EXTRACT)
7028 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7029 index = XEXP (XEXP (x, 0), 0);
7030 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7031 if (INTVAL (XEXP (x, 1)) != 32 + shift
7032 || INTVAL (XEXP (x, 2)) != 0)
7033 shift = -1;
7034 }
7035 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7036 (const_int 0xffffffff<<shift)) */
7037 else if (GET_CODE (x) == AND
7038 && GET_MODE (x) == DImode
7039 && GET_CODE (XEXP (x, 0)) == ASHIFT
7040 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7041 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7042 && CONST_INT_P (XEXP (x, 1)))
7043 {
7044 type = ADDRESS_REG_UXTW;
7045 index = XEXP (XEXP (x, 0), 0);
7046 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7047 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7048 shift = -1;
7049 }
7050 /* (mult:P (reg:P) (const_int scale)) */
7051 else if (GET_CODE (x) == MULT
7052 && GET_MODE (x) == Pmode
7053 && GET_MODE (XEXP (x, 0)) == Pmode
7054 && CONST_INT_P (XEXP (x, 1)))
7055 {
7056 type = ADDRESS_REG_REG;
7057 index = XEXP (x, 0);
7058 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7059 }
7060 /* (ashift:P (reg:P) (const_int shift)) */
7061 else if (GET_CODE (x) == ASHIFT
7062 && GET_MODE (x) == Pmode
7063 && GET_MODE (XEXP (x, 0)) == Pmode
7064 && CONST_INT_P (XEXP (x, 1)))
7065 {
7066 type = ADDRESS_REG_REG;
7067 index = XEXP (x, 0);
7068 shift = INTVAL (XEXP (x, 1));
7069 }
7070 else
7071 return false;
7072
7073 if (!strict_p
7074 && GET_CODE (index) == SUBREG
7075 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7076 index = SUBREG_REG (index);
7077
7078 if (aarch64_sve_data_mode_p (mode))
7079 {
7080 if (type != ADDRESS_REG_REG
7081 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7082 return false;
7083 }
7084 else
7085 {
7086 if (shift != 0
7087 && !(IN_RANGE (shift, 1, 3)
7088 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7089 return false;
7090 }
7091
7092 if (REG_P (index)
7093 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7094 {
7095 info->type = type;
7096 info->offset = index;
7097 info->shift = shift;
7098 return true;
7099 }
7100
7101 return false;
7102 }
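/* For example, with MODE == DImode an index of the form
   (mult:DI (reg:DI xN) (const_int 8)) is classified as ADDRESS_REG_REG
   with shift 3, corresponding to the [Xn, Xm, lsl #3] addressing form.  */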
7103
7104 /* Return true if MODE is one of the modes for which we
7105 support LDP/STP operations. */
7106
7107 static bool
7108 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7109 {
7110 return mode == SImode || mode == DImode
7111 || mode == SFmode || mode == DFmode
7112 || (aarch64_vector_mode_supported_p (mode)
7113 && (known_eq (GET_MODE_SIZE (mode), 8)
7114 || (known_eq (GET_MODE_SIZE (mode), 16)
7115 && (aarch64_tune_params.extra_tuning_flags
7116 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7117 }
7118
7119 /* Return true if REGNO is a virtual pointer register, or an eliminable
7120 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7121 include stack_pointer or hard_frame_pointer. */
7122 static bool
7123 virt_or_elim_regno_p (unsigned regno)
7124 {
7125 return ((regno >= FIRST_VIRTUAL_REGISTER
7126 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7127 || regno == FRAME_POINTER_REGNUM
7128 || regno == ARG_POINTER_REGNUM);
7129 }
7130
7131 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7132 If it is, fill in INFO appropriately. STRICT_P is true if
7133 REG_OK_STRICT is in effect. */
7134
7135 bool
7136 aarch64_classify_address (struct aarch64_address_info *info,
7137 rtx x, machine_mode mode, bool strict_p,
7138 aarch64_addr_query_type type)
7139 {
7140 enum rtx_code code = GET_CODE (x);
7141 rtx op0, op1;
7142 poly_int64 offset;
7143
7144 HOST_WIDE_INT const_size;
7145
7146 /* On BE, we use load/store pair for all large int mode load/stores.
7147 TI/TFmode may also use a load/store pair. */
7148 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7149 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7150 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7151 || type == ADDR_QUERY_LDP_STP_N
7152 || mode == TImode
7153 || mode == TFmode
7154 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7155
7156 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
7157 corresponds to the full size of the memory being loaded/stored and
7158 the mode used to classify the address is half of that size. */
7159 if (type == ADDR_QUERY_LDP_STP_N
7160 && known_eq (GET_MODE_SIZE (mode), 16))
7161 mode = DFmode;
7162
7163 bool allow_reg_index_p = (!load_store_pair_p
7164 && (known_lt (GET_MODE_SIZE (mode), 16)
7165 || vec_flags == VEC_ADVSIMD
7166 || vec_flags & VEC_SVE_DATA));
7167
7168 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7169 [Rn, #offset, MUL VL]. */
7170 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7171 && (code != REG && code != PLUS))
7172 return false;
7173
7174 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7175 REG addressing. */
7176 if (advsimd_struct_p
7177 && !BYTES_BIG_ENDIAN
7178 && (code != POST_INC && code != REG))
7179 return false;
7180
7181 gcc_checking_assert (GET_MODE (x) == VOIDmode
7182 || SCALAR_INT_MODE_P (GET_MODE (x)));
7183
7184 switch (code)
7185 {
7186 case REG:
7187 case SUBREG:
7188 info->type = ADDRESS_REG_IMM;
7189 info->base = x;
7190 info->offset = const0_rtx;
7191 info->const_offset = 0;
7192 return aarch64_base_register_rtx_p (x, strict_p);
7193
7194 case PLUS:
7195 op0 = XEXP (x, 0);
7196 op1 = XEXP (x, 1);
7197
7198 if (! strict_p
7199 && REG_P (op0)
7200 && virt_or_elim_regno_p (REGNO (op0))
7201 && poly_int_rtx_p (op1, &offset))
7202 {
7203 info->type = ADDRESS_REG_IMM;
7204 info->base = op0;
7205 info->offset = op1;
7206 info->const_offset = offset;
7207
7208 return true;
7209 }
7210
7211 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7212 && aarch64_base_register_rtx_p (op0, strict_p)
7213 && poly_int_rtx_p (op1, &offset))
7214 {
7215 info->type = ADDRESS_REG_IMM;
7216 info->base = op0;
7217 info->offset = op1;
7218 info->const_offset = offset;
7219
7220 /* TImode and TFmode values are allowed in both pairs of X
7221 registers and individual Q registers. The available
7222 address modes are:
7223 X,X: 7-bit signed scaled offset
7224 Q: 9-bit signed offset
7225 We conservatively require an offset representable in either mode.
7226 When performing the check for pairs of X registers i.e. LDP/STP
7227 pass down DImode since that is the natural size of the LDP/STP
7228 instruction memory accesses. */
7229 if (mode == TImode || mode == TFmode)
7230 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7231 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7232 || offset_12bit_unsigned_scaled_p (mode, offset)));
7233
7234 /* A 7-bit offset check because OImode will emit an ldp/stp
7235 instruction (only big endian will get here).
7236 For ldp/stp instructions, the offset is scaled for the size of a
7237 single element of the pair. */
7238 if (mode == OImode)
7239 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7240
7241 /* Three 9/12-bit offset checks because CImode will emit three
7242 ldr/str instructions (only big endian will get here). */
7243 if (mode == CImode)
7244 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7245 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7246 offset + 32)
7247 || offset_12bit_unsigned_scaled_p (V16QImode,
7248 offset + 32)));
7249
7250 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7251 instructions (only big endian will get here). */
7252 if (mode == XImode)
7253 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7254 && aarch64_offset_7bit_signed_scaled_p (TImode,
7255 offset + 32));
7256
7257 /* Make "m" use the LD1 offset range for SVE data modes, so
7258 that pre-RTL optimizers like ivopts will work to that range
7259 instead of the wider LDR/STR range. */
7260 if (vec_flags == VEC_SVE_DATA)
7261 return (type == ADDR_QUERY_M
7262 ? offset_4bit_signed_scaled_p (mode, offset)
7263 : offset_9bit_signed_scaled_p (mode, offset));
7264
7265 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7266 {
7267 poly_int64 end_offset = (offset
7268 + GET_MODE_SIZE (mode)
7269 - BYTES_PER_SVE_VECTOR);
7270 return (type == ADDR_QUERY_M
7271 ? offset_4bit_signed_scaled_p (mode, offset)
7272 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7273 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7274 end_offset)));
7275 }
7276
7277 if (vec_flags == VEC_SVE_PRED)
7278 return offset_9bit_signed_scaled_p (mode, offset);
7279
7280 if (load_store_pair_p)
7281 return ((known_eq (GET_MODE_SIZE (mode), 4)
7282 || known_eq (GET_MODE_SIZE (mode), 8)
7283 || known_eq (GET_MODE_SIZE (mode), 16))
7284 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7285 else
7286 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7287 || offset_12bit_unsigned_scaled_p (mode, offset));
7288 }
7289
7290 if (allow_reg_index_p)
7291 {
7292 /* Look for base + (scaled/extended) index register. */
7293 if (aarch64_base_register_rtx_p (op0, strict_p)
7294 && aarch64_classify_index (info, op1, mode, strict_p))
7295 {
7296 info->base = op0;
7297 return true;
7298 }
7299 if (aarch64_base_register_rtx_p (op1, strict_p)
7300 && aarch64_classify_index (info, op0, mode, strict_p))
7301 {
7302 info->base = op1;
7303 return true;
7304 }
7305 }
7306
7307 return false;
7308
7309 case POST_INC:
7310 case POST_DEC:
7311 case PRE_INC:
7312 case PRE_DEC:
7313 info->type = ADDRESS_REG_WB;
7314 info->base = XEXP (x, 0);
7315 info->offset = NULL_RTX;
7316 return aarch64_base_register_rtx_p (info->base, strict_p);
7317
7318 case POST_MODIFY:
7319 case PRE_MODIFY:
7320 info->type = ADDRESS_REG_WB;
7321 info->base = XEXP (x, 0);
7322 if (GET_CODE (XEXP (x, 1)) == PLUS
7323 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7324 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7325 && aarch64_base_register_rtx_p (info->base, strict_p))
7326 {
7327 info->offset = XEXP (XEXP (x, 1), 1);
7328 info->const_offset = offset;
7329
7330 /* TImode and TFmode values are allowed in both pairs of X
7331 registers and individual Q registers. The available
7332 address modes are:
7333 X,X: 7-bit signed scaled offset
7334 Q: 9-bit signed offset
7335 We conservatively require an offset representable in either mode.
7336 */
7337 if (mode == TImode || mode == TFmode)
7338 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7339 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7340
7341 if (load_store_pair_p)
7342 return ((known_eq (GET_MODE_SIZE (mode), 4)
7343 || known_eq (GET_MODE_SIZE (mode), 8)
7344 || known_eq (GET_MODE_SIZE (mode), 16))
7345 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7346 else
7347 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7348 }
7349 return false;
7350
7351 case CONST:
7352 case SYMBOL_REF:
7353 case LABEL_REF:
7354 /* Load literal: PC-relative constant pool entry. Only supported
7355 for SImode or larger. */
7356 info->type = ADDRESS_SYMBOLIC;
7357
7358 if (!load_store_pair_p
7359 && GET_MODE_SIZE (mode).is_constant (&const_size)
7360 && const_size >= 4)
7361 {
7362 rtx sym, addend;
7363
7364 split_const (x, &sym, &addend);
7365 return ((GET_CODE (sym) == LABEL_REF
7366 || (GET_CODE (sym) == SYMBOL_REF
7367 && CONSTANT_POOL_ADDRESS_P (sym)
7368 && aarch64_pcrelative_literal_loads)));
7369 }
7370 return false;
7371
7372 case LO_SUM:
7373 info->type = ADDRESS_LO_SUM;
7374 info->base = XEXP (x, 0);
7375 info->offset = XEXP (x, 1);
7376 if (allow_reg_index_p
7377 && aarch64_base_register_rtx_p (info->base, strict_p))
7378 {
7379 rtx sym, offs;
7380 split_const (info->offset, &sym, &offs);
7381 if (GET_CODE (sym) == SYMBOL_REF
7382 && (aarch64_classify_symbol (sym, INTVAL (offs))
7383 == SYMBOL_SMALL_ABSOLUTE))
7384 {
7385 /* The symbol and offset must be aligned to the access size. */
7386 unsigned int align;
7387
7388 if (CONSTANT_POOL_ADDRESS_P (sym))
7389 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7390 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7391 {
7392 tree exp = SYMBOL_REF_DECL (sym);
7393 align = TYPE_ALIGN (TREE_TYPE (exp));
7394 align = aarch64_constant_alignment (exp, align);
7395 }
7396 else if (SYMBOL_REF_DECL (sym))
7397 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7398 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7399 && SYMBOL_REF_BLOCK (sym) != NULL)
7400 align = SYMBOL_REF_BLOCK (sym)->alignment;
7401 else
7402 align = BITS_PER_UNIT;
7403
7404 poly_int64 ref_size = GET_MODE_SIZE (mode);
7405 if (known_eq (ref_size, 0))
7406 ref_size = GET_MODE_SIZE (DImode);
7407
7408 return (multiple_p (INTVAL (offs), ref_size)
7409 && multiple_p (align / BITS_PER_UNIT, ref_size));
7410 }
7411 }
7412 return false;
7413
7414 default:
7415 return false;
7416 }
7417 }
7418
7419 /* Return true if the address X is valid for a PRFM instruction.
7420 STRICT_P is true if we should do strict checking with
7421 aarch64_classify_address. */
7422
7423 bool
7424 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7425 {
7426 struct aarch64_address_info addr;
7427
7428 /* PRFM accepts the same addresses as DImode... */
7429 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7430 if (!res)
7431 return false;
7432
7433 /* ... except writeback forms. */
7434 return addr.type != ADDRESS_REG_WB;
7435 }
7436
7437 bool
7438 aarch64_symbolic_address_p (rtx x)
7439 {
7440 rtx offset;
7441
7442 split_const (x, &x, &offset);
7443 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7444 }
7445
7446 /* Classify the base of symbolic expression X. */
7447
7448 enum aarch64_symbol_type
7449 aarch64_classify_symbolic_expression (rtx x)
7450 {
7451 rtx offset;
7452
7453 split_const (x, &x, &offset);
7454 return aarch64_classify_symbol (x, INTVAL (offset));
7455 }
7456
7457
7458 /* Return TRUE if X is a legitimate address for accessing memory in
7459 mode MODE. */
7460 static bool
7461 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7462 {
7463 struct aarch64_address_info addr;
7464
7465 return aarch64_classify_address (&addr, x, mode, strict_p);
7466 }
7467
7468 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7469 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7470 bool
7471 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7472 aarch64_addr_query_type type)
7473 {
7474 struct aarch64_address_info addr;
7475
7476 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7477 }
7478
7479 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7480
7481 static bool
7482 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7483 poly_int64 orig_offset,
7484 machine_mode mode)
7485 {
7486 HOST_WIDE_INT size;
7487 if (GET_MODE_SIZE (mode).is_constant (&size))
7488 {
7489 HOST_WIDE_INT const_offset, second_offset;
7490
7491 /* A general SVE offset is A * VQ + B. Remove the A component from
7492 coefficient 0 in order to get the constant B. */
7493 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7494
7495 /* Split an out-of-range address displacement into a base and
7496 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7497 range otherwise to increase opportunities for sharing the base
7498 address of different sizes. Unaligned accesses use the signed
7499 9-bit range, TImode/TFmode use the intersection of signed
7500 scaled 7-bit and signed 9-bit offset. */
7501 if (mode == TImode || mode == TFmode)
7502 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7503 else if ((const_offset & (size - 1)) != 0)
7504 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7505 else
7506 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7507
7508 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7509 return false;
7510
7511 /* Split the offset into second_offset and the rest. */
7512 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7513 *offset2 = gen_int_mode (second_offset, Pmode);
7514 return true;
7515 }
7516 else
7517 {
7518 /* Get the mode we should use as the basis of the range. For structure
7519 modes this is the mode of one vector. */
7520 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7521 machine_mode step_mode
7522 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7523
7524 /* Get the "mul vl" multiplier we'd like to use. */
7525 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7526 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7527 if (vec_flags & VEC_SVE_DATA)
7528 /* LDR supports a 9-bit range, but the move patterns for
7529 structure modes require all vectors to be in range of the
7530 same base. The simplest way of accommodating that while still
7531 promoting reuse of anchor points between different modes is
7532 to use an 8-bit range unconditionally. */
7533 vnum = ((vnum + 128) & 255) - 128;
7534 else
7535 /* Predicates are only handled singly, so we might as well use
7536 the full range. */
7537 vnum = ((vnum + 256) & 511) - 256;
7538 if (vnum == 0)
7539 return false;
7540
7541 /* Convert the "mul vl" multiplier into a byte offset. */
7542 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7543 if (known_eq (second_offset, orig_offset))
7544 return false;
7545
7546 /* Split the offset into second_offset and the rest. */
7547 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7548 *offset2 = gen_int_mode (second_offset, Pmode);
7549 return true;
7550 }
7551 }
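/* For example, a DImode access at constant offset 0x10010 is split into
   0x10000 and 0x10, so that the large part can be added to the base
   register while the remaining 0x10 stays within the scaled 12-bit
   immediate range.  */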
7552
7553 /* Return the binary representation of floating point constant VALUE in INTVAL.
7554 If the value cannot be converted, return false without setting INTVAL.
7555 The conversion is done in the mode of VALUE. */
7556 bool
7557 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7558 {
7559
7560 /* We make a general exception for 0. */
7561 if (aarch64_float_const_zero_rtx_p (value))
7562 {
7563 *intval = 0;
7564 return true;
7565 }
7566
7567 scalar_float_mode mode;
7568 if (GET_CODE (value) != CONST_DOUBLE
7569 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7570 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7571 /* Only support up to DF mode. */
7572 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7573 return false;
7574
7575 unsigned HOST_WIDE_INT ival = 0;
7576
7577 long res[2];
7578 real_to_target (res,
7579 CONST_DOUBLE_REAL_VALUE (value),
7580 REAL_MODE_FORMAT (mode));
7581
7582 if (mode == DFmode)
7583 {
7584 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7585 ival = zext_hwi (res[order], 32);
7586 ival |= (zext_hwi (res[1 - order], 32) << 32);
7587 }
7588 else
7589 ival = zext_hwi (res[0], 32);
7590
7591 *intval = ival;
7592 return true;
7593 }
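/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0f as 0x3f800000.  */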
7594
7595 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7596 single MOV(+MOVK) followed by an FMOV. */
7597 bool
7598 aarch64_float_const_rtx_p (rtx x)
7599 {
7600 machine_mode mode = GET_MODE (x);
7601 if (mode == VOIDmode)
7602 return false;
7603
7604 /* Determine whether it's cheaper to write float constants as
7605 mov/movk pairs than as ldr/adrp pairs. */
7606 unsigned HOST_WIDE_INT ival;
7607
7608 if (GET_CODE (x) == CONST_DOUBLE
7609 && SCALAR_FLOAT_MODE_P (mode)
7610 && aarch64_reinterpret_float_as_int (x, &ival))
7611 {
7612 scalar_int_mode imode = (mode == HFmode
7613 ? SImode
7614 : int_mode_for_mode (mode).require ());
7615 int num_instr = aarch64_internal_mov_immediate
7616 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7617 return num_instr < 3;
7618 }
7619
7620 return false;
7621 }
7622
7623 /* Return TRUE if rtx X is the immediate constant 0.0. */
7624 bool
7625 aarch64_float_const_zero_rtx_p (rtx x)
7626 {
7627 if (GET_MODE (x) == VOIDmode)
7628 return false;
7629
7630 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7631 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7632 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7633 }
7634
7635 /* Return TRUE if rtx X is an immediate constant that fits in a single
7636 MOVI immediate operation. */
7637 bool
7638 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7639 {
7640 if (!TARGET_SIMD)
7641 return false;
7642
7643 machine_mode vmode;
7644 scalar_int_mode imode;
7645 unsigned HOST_WIDE_INT ival;
7646
7647 if (GET_CODE (x) == CONST_DOUBLE
7648 && SCALAR_FLOAT_MODE_P (mode))
7649 {
7650 if (!aarch64_reinterpret_float_as_int (x, &ival))
7651 return false;
7652
7653 /* We make a general exception for 0. */
7654 if (aarch64_float_const_zero_rtx_p (x))
7655 return true;
7656
7657 imode = int_mode_for_mode (mode).require ();
7658 }
7659 else if (GET_CODE (x) == CONST_INT
7660 && is_a <scalar_int_mode> (mode, &imode))
7661 ival = INTVAL (x);
7662 else
7663 return false;
7664
7665 /* Use a 64-bit vector mode for everything except DImode/DFmode, where
7666 we use a 128-bit vector mode. */
7667 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7668
7669 vmode = aarch64_simd_container_mode (imode, width);
7670 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7671
7672 return aarch64_simd_valid_immediate (v_op, NULL);
7673 }
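/* For example, the SFmode constant 2.0f (bit pattern 0x40000000) should be
   accepted, since MOVI can materialise 0x40 shifted left by 24 in every
   32-bit lane, whereas an arbitrary pattern such as 0x3eaaaaab (roughly
   0.333f) is rejected.  */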
7674
7675
7676 /* Return the fixed registers used for condition codes. */
7677
7678 static bool
7679 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7680 {
7681 *p1 = CC_REGNUM;
7682 *p2 = INVALID_REGNUM;
7683 return true;
7684 }
7685
7686 /* This function is used by the call expanders of the machine description.
7687 RESULT is the register in which the result is returned. It's NULL for
7688 "call" and "sibcall".
7689 MEM is the location of the function call.
7690 SIBCALL indicates whether this function call is a normal call or a sibling
7691 call; a different pattern is generated accordingly. */
7692
7693 void
7694 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7695 {
7696 rtx call, callee, tmp;
7697 rtvec vec;
7698 machine_mode mode;
7699
7700 gcc_assert (MEM_P (mem));
7701 callee = XEXP (mem, 0);
7702 mode = GET_MODE (callee);
7703 gcc_assert (mode == Pmode);
7704
7705 /* Decide if we should generate indirect calls by loading the
7706 address of the callee into a register before performing
7707 the branch-and-link. */
7708 if (SYMBOL_REF_P (callee)
7709 ? (aarch64_is_long_call_p (callee)
7710 || aarch64_is_noplt_call_p (callee))
7711 : !REG_P (callee))
7712 XEXP (mem, 0) = force_reg (mode, callee);
7713
7714 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7715
7716 if (result != NULL_RTX)
7717 call = gen_rtx_SET (result, call);
7718
7719 if (sibcall)
7720 tmp = ret_rtx;
7721 else
7722 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7723
7724 vec = gen_rtvec (2, call, tmp);
7725 call = gen_rtx_PARALLEL (VOIDmode, vec);
7726
7727 aarch64_emit_call_insn (call);
7728 }
7729
7730 /* Emit call insn with PAT and do aarch64-specific handling. */
7731
7732 void
7733 aarch64_emit_call_insn (rtx pat)
7734 {
7735 rtx insn = emit_call_insn (pat);
7736
7737 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7738 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7739 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7740 }
7741
7742 machine_mode
7743 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7744 {
7745 machine_mode mode_x = GET_MODE (x);
7746 rtx_code code_x = GET_CODE (x);
7747
7748 /* Floating-point comparisons return CCFPmode if the comparison need not
7749 raise an exception for unordered operands, and CCFPEmode otherwise. */
7750 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7751 {
7752 switch (code)
7753 {
7754 case EQ:
7755 case NE:
7756 case UNORDERED:
7757 case ORDERED:
7758 case UNLT:
7759 case UNLE:
7760 case UNGT:
7761 case UNGE:
7762 case UNEQ:
7763 return CCFPmode;
7764
7765 case LT:
7766 case LE:
7767 case GT:
7768 case GE:
7769 case LTGT:
7770 return CCFPEmode;
7771
7772 default:
7773 gcc_unreachable ();
7774 }
7775 }
7776
7777 /* Equality comparisons of short modes against zero can be performed
7778 using the TST instruction with the appropriate bitmask. */
7779 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7780 && (code == EQ || code == NE)
7781 && (mode_x == HImode || mode_x == QImode))
7782 return CC_NZmode;
7783
7784 /* Similarly, comparisons of zero_extends from shorter modes can
7785 be performed using an ANDS with an immediate mask. */
7786 if (y == const0_rtx && code_x == ZERO_EXTEND
7787 && (mode_x == SImode || mode_x == DImode)
7788 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7789 && (code == EQ || code == NE))
7790 return CC_NZmode;
7791
7792 if ((mode_x == SImode || mode_x == DImode)
7793 && y == const0_rtx
7794 && (code == EQ || code == NE || code == LT || code == GE)
7795 && (code_x == PLUS || code_x == MINUS || code_x == AND
7796 || code_x == NEG
7797 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7798 && CONST_INT_P (XEXP (x, 2)))))
7799 return CC_NZmode;
7800
7801 /* A compare with a shifted operand. Because of canonicalization,
7802 the comparison will have to be swapped when we emit the assembly
7803 code. */
7804 if ((mode_x == SImode || mode_x == DImode)
7805 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7806 && (code_x == ASHIFT || code_x == ASHIFTRT
7807 || code_x == LSHIFTRT
7808 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7809 return CC_SWPmode;
7810
7811 /* Similarly for a negated operand, but we can only do this for
7812 equalities. */
7813 if ((mode_x == SImode || mode_x == DImode)
7814 && (REG_P (y) || GET_CODE (y) == SUBREG)
7815 && (code == EQ || code == NE)
7816 && code_x == NEG)
7817 return CC_Zmode;
7818
7819 /* A test for unsigned overflow from an addition. */
7820 if ((mode_x == DImode || mode_x == TImode)
7821 && (code == LTU || code == GEU)
7822 && code_x == PLUS
7823 && rtx_equal_p (XEXP (x, 0), y))
7824 return CC_Cmode;
7825
7826 /* A test for unsigned overflow from an add with carry. */
7827 if ((mode_x == DImode || mode_x == TImode)
7828 && (code == LTU || code == GEU)
7829 && code_x == PLUS
7830 && CONST_SCALAR_INT_P (y)
7831 && (rtx_mode_t (y, mode_x)
7832 == (wi::shwi (1, mode_x)
7833 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7834 return CC_ADCmode;
7835
7836 /* A test for signed overflow. */
7837 if ((mode_x == DImode || mode_x == TImode)
7838 && code == NE
7839 && code_x == PLUS
7840 && GET_CODE (y) == SIGN_EXTEND)
7841 return CC_Vmode;
7842
7843 /* For everything else, return CCmode. */
7844 return CCmode;
7845 }
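/* For example, comparing (ashift:DI (reg) (const_int 2)) against another
   register selects CC_SWPmode, while an EQ/NE/LT/GE comparison of a
   DImode PLUS against zero selects CC_NZmode so that the flags set by an
   ADDS can be reused.  */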
7846
7847 static int
7848 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7849
7850 int
7851 aarch64_get_condition_code (rtx x)
7852 {
7853 machine_mode mode = GET_MODE (XEXP (x, 0));
7854 enum rtx_code comp_code = GET_CODE (x);
7855
7856 if (GET_MODE_CLASS (mode) != MODE_CC)
7857 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7858 return aarch64_get_condition_code_1 (mode, comp_code);
7859 }
7860
7861 static int
7862 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7863 {
7864 switch (mode)
7865 {
7866 case E_CCFPmode:
7867 case E_CCFPEmode:
7868 switch (comp_code)
7869 {
7870 case GE: return AARCH64_GE;
7871 case GT: return AARCH64_GT;
7872 case LE: return AARCH64_LS;
7873 case LT: return AARCH64_MI;
7874 case NE: return AARCH64_NE;
7875 case EQ: return AARCH64_EQ;
7876 case ORDERED: return AARCH64_VC;
7877 case UNORDERED: return AARCH64_VS;
7878 case UNLT: return AARCH64_LT;
7879 case UNLE: return AARCH64_LE;
7880 case UNGT: return AARCH64_HI;
7881 case UNGE: return AARCH64_PL;
7882 default: return -1;
7883 }
7884 break;
7885
7886 case E_CCmode:
7887 switch (comp_code)
7888 {
7889 case NE: return AARCH64_NE;
7890 case EQ: return AARCH64_EQ;
7891 case GE: return AARCH64_GE;
7892 case GT: return AARCH64_GT;
7893 case LE: return AARCH64_LE;
7894 case LT: return AARCH64_LT;
7895 case GEU: return AARCH64_CS;
7896 case GTU: return AARCH64_HI;
7897 case LEU: return AARCH64_LS;
7898 case LTU: return AARCH64_CC;
7899 default: return -1;
7900 }
7901 break;
7902
7903 case E_CC_SWPmode:
7904 switch (comp_code)
7905 {
7906 case NE: return AARCH64_NE;
7907 case EQ: return AARCH64_EQ;
7908 case GE: return AARCH64_LE;
7909 case GT: return AARCH64_LT;
7910 case LE: return AARCH64_GE;
7911 case LT: return AARCH64_GT;
7912 case GEU: return AARCH64_LS;
7913 case GTU: return AARCH64_CC;
7914 case LEU: return AARCH64_CS;
7915 case LTU: return AARCH64_HI;
7916 default: return -1;
7917 }
7918 break;
7919
7920 case E_CC_NZCmode:
7921 switch (comp_code)
7922 {
7923 case NE: return AARCH64_NE; /* = any */
7924 case EQ: return AARCH64_EQ; /* = none */
7925 case GE: return AARCH64_PL; /* = nfrst */
7926 case LT: return AARCH64_MI; /* = first */
7927 case GEU: return AARCH64_CS; /* = nlast */
7928 case GTU: return AARCH64_HI; /* = pmore */
7929 case LEU: return AARCH64_LS; /* = plast */
7930 case LTU: return AARCH64_CC; /* = last */
7931 default: return -1;
7932 }
7933 break;
7934
7935 case E_CC_NZmode:
7936 switch (comp_code)
7937 {
7938 case NE: return AARCH64_NE;
7939 case EQ: return AARCH64_EQ;
7940 case GE: return AARCH64_PL;
7941 case LT: return AARCH64_MI;
7942 default: return -1;
7943 }
7944 break;
7945
7946 case E_CC_Zmode:
7947 switch (comp_code)
7948 {
7949 case NE: return AARCH64_NE;
7950 case EQ: return AARCH64_EQ;
7951 default: return -1;
7952 }
7953 break;
7954
7955 case E_CC_Cmode:
7956 switch (comp_code)
7957 {
7958 case LTU: return AARCH64_CS;
7959 case GEU: return AARCH64_CC;
7960 default: return -1;
7961 }
7962 break;
7963
7964 case E_CC_ADCmode:
7965 switch (comp_code)
7966 {
7967 case GEU: return AARCH64_CS;
7968 case LTU: return AARCH64_CC;
7969 default: return -1;
7970 }
7971 break;
7972
7973 case E_CC_Vmode:
7974 switch (comp_code)
7975 {
7976 case NE: return AARCH64_VS;
7977 case EQ: return AARCH64_VC;
7978 default: return -1;
7979 }
7980 break;
7981
7982 default:
7983 return -1;
7984 }
7985
7986 return -1;
7987 }
7988
7989 bool
7990 aarch64_const_vec_all_same_in_range_p (rtx x,
7991 HOST_WIDE_INT minval,
7992 HOST_WIDE_INT maxval)
7993 {
7994 rtx elt;
7995 return (const_vec_duplicate_p (x, &elt)
7996 && CONST_INT_P (elt)
7997 && IN_RANGE (INTVAL (elt), minval, maxval));
7998 }
7999
8000 bool
8001 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8002 {
8003 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8004 }
8005
8006 /* Return true if VEC is a constant in which every element is in the range
8007 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8008
8009 static bool
8010 aarch64_const_vec_all_in_range_p (rtx vec,
8011 HOST_WIDE_INT minval,
8012 HOST_WIDE_INT maxval)
8013 {
8014 if (GET_CODE (vec) != CONST_VECTOR
8015 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8016 return false;
8017
8018 int nunits;
8019 if (!CONST_VECTOR_STEPPED_P (vec))
8020 nunits = const_vector_encoded_nelts (vec);
8021 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8022 return false;
8023
8024 for (int i = 0; i < nunits; i++)
8025 {
8026 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8027 if (!CONST_INT_P (vec_elem)
8028 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8029 return false;
8030 }
8031 return true;
8032 }
8033
8034 /* N Z C V. */
8035 #define AARCH64_CC_V 1
8036 #define AARCH64_CC_C (1 << 1)
8037 #define AARCH64_CC_Z (1 << 2)
8038 #define AARCH64_CC_N (1 << 3)
8039
8040 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8041 static const int aarch64_nzcv_codes[] =
8042 {
8043 0, /* EQ, Z == 1. */
8044 AARCH64_CC_Z, /* NE, Z == 0. */
8045 0, /* CS, C == 1. */
8046 AARCH64_CC_C, /* CC, C == 0. */
8047 0, /* MI, N == 1. */
8048 AARCH64_CC_N, /* PL, N == 0. */
8049 0, /* VS, V == 1. */
8050 AARCH64_CC_V, /* VC, V == 0. */
8051 0, /* HI, C == 1 && Z == 0. */
8052 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8053 AARCH64_CC_V, /* GE, N == V. */
8054 0, /* LT, N != V. */
8055 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8056 0, /* LE, !(Z == 0 && N == V). */
8057 0, /* AL, Any. */
8058 0 /* NV, Any. */
8059 };
8060
8061 /* Print floating-point vector immediate operand X to F, negating it
8062 first if NEGATE is true. Return true on success, false if it isn't
8063 a constant we can handle. */
8064
8065 static bool
8066 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8067 {
8068 rtx elt;
8069
8070 if (!const_vec_duplicate_p (x, &elt))
8071 return false;
8072
8073 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8074 if (negate)
8075 r = real_value_negate (&r);
8076
8077 /* We only handle the SVE single-bit immediates here. */
8078 if (real_equal (&r, &dconst0))
8079 asm_fprintf (f, "0.0");
8080 else if (real_equal (&r, &dconst1))
8081 asm_fprintf (f, "1.0");
8082 else if (real_equal (&r, &dconsthalf))
8083 asm_fprintf (f, "0.5");
8084 else
8085 return false;
8086
8087 return true;
8088 }
8089
8090 /* Return the equivalent letter for size. */
8091 static char
8092 sizetochar (int size)
8093 {
8094 switch (size)
8095 {
8096 case 64: return 'd';
8097 case 32: return 's';
8098 case 16: return 'h';
8099 case 8 : return 'b';
8100 default: gcc_unreachable ();
8101 }
8102 }
8103
8104 /* Print operand X to file F in a target specific manner according to CODE.
8105 The acceptable formatting commands given by CODE are:
8106 'c': An integer or symbol address without a preceding #
8107 sign.
8108 'C': Take the duplicated element in a vector constant
8109 and print it in hex.
8110 'D': Take the duplicated element in a vector constant
8111 and print it as an unsigned integer, in decimal.
8112 'e': Print the sign/zero-extend size as a character 8->b,
8113 16->h, 32->w.
8114 'p': Prints N such that 2^N == X (X must be a power of 2 and
8115 a const_int).
8116 'P': Print the number of non-zero bits in X (a const_int).
8117 'H': Print the higher numbered register of a pair (TImode)
8118 of regs.
8119 'm': Print a condition (eq, ne, etc).
8120 'M': Same as 'm', but invert condition.
8121 'N': Take the duplicated element in a vector constant
8122 and print the negative of it in decimal.
8123 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8124 'S/T/U/V': Print a FP/SIMD register name for a register list.
8125 The register printed is the FP/SIMD register name
8126 of X + 0/1/2/3 for S/T/U/V.
8127 'R': Print a scalar FP/SIMD register name + 1.
8128 'X': Print bottom 16 bits of integer constant in hex.
8129 'w/x': Print a general register name or the zero register
8130 (32-bit or 64-bit).
8131 '0': Print a normal operand; if it's a general register,
8132 then we assume DImode.
8133 'k': Print NZCV for conditional compare instructions.
8134 'A': Output address constant representing the first
8135 argument of X, specifying a relocation offset
8136 if appropriate.
8137 'L': Output constant address specified by X
8138 with a relocation offset if appropriate.
8139 'G': Prints address of X, specifying a PC relative
8140 relocation mode if appropriate.
8141 'y': Output address of LDP or STP - this is used for
8142 some LDP/STPs which don't use a PARALLEL in their
8143 pattern (so the mode needs to be adjusted).
8144 'z': Output address of a typical LDP or STP. */
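/* For example, if operand 0 is a DImode general register (say x0) and
   operand 1 is the constant 16, a template such as "add\t%x0, %x0, %1"
   prints "add x0, x0, 16", and "%w0" would print "w0" instead of "x0".  */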
8145
8146 static void
8147 aarch64_print_operand (FILE *f, rtx x, int code)
8148 {
8149 rtx elt;
8150 switch (code)
8151 {
8152 case 'c':
8153 switch (GET_CODE (x))
8154 {
8155 case CONST_INT:
8156 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8157 break;
8158
8159 case SYMBOL_REF:
8160 output_addr_const (f, x);
8161 break;
8162
8163 case CONST:
8164 if (GET_CODE (XEXP (x, 0)) == PLUS
8165 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8166 {
8167 output_addr_const (f, x);
8168 break;
8169 }
8170 /* Fall through. */
8171
8172 default:
8173 output_operand_lossage ("unsupported operand for code '%c'", code);
8174 }
8175 break;
8176
8177 case 'e':
8178 {
8179 int n;
8180
8181 if (!CONST_INT_P (x)
8182 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
8183 {
8184 output_operand_lossage ("invalid operand for '%%%c'", code);
8185 return;
8186 }
8187
8188 switch (n)
8189 {
8190 case 3:
8191 fputc ('b', f);
8192 break;
8193 case 4:
8194 fputc ('h', f);
8195 break;
8196 case 5:
8197 fputc ('w', f);
8198 break;
8199 default:
8200 output_operand_lossage ("invalid operand for '%%%c'", code);
8201 return;
8202 }
8203 }
8204 break;
8205
8206 case 'p':
8207 {
8208 int n;
8209
8210 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8211 {
8212 output_operand_lossage ("invalid operand for '%%%c'", code);
8213 return;
8214 }
8215
8216 asm_fprintf (f, "%d", n);
8217 }
8218 break;
8219
8220 case 'P':
8221 if (!CONST_INT_P (x))
8222 {
8223 output_operand_lossage ("invalid operand for '%%%c'", code);
8224 return;
8225 }
8226
8227 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8228 break;
8229
8230 case 'H':
8231 if (x == const0_rtx)
8232 {
8233 asm_fprintf (f, "xzr");
8234 break;
8235 }
8236
8237 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8238 {
8239 output_operand_lossage ("invalid operand for '%%%c'", code);
8240 return;
8241 }
8242
8243 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8244 break;
8245
8246 case 'M':
8247 case 'm':
8248 {
8249 int cond_code;
8250 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8251 if (x == const_true_rtx)
8252 {
8253 if (code == 'M')
8254 fputs ("nv", f);
8255 return;
8256 }
8257
8258 if (!COMPARISON_P (x))
8259 {
8260 output_operand_lossage ("invalid operand for '%%%c'", code);
8261 return;
8262 }
8263
8264 cond_code = aarch64_get_condition_code (x);
8265 gcc_assert (cond_code >= 0);
8266 if (code == 'M')
8267 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8268 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8269 fputs (aarch64_sve_condition_codes[cond_code], f);
8270 else
8271 fputs (aarch64_condition_codes[cond_code], f);
8272 }
8273 break;
8274
8275 case 'N':
8276 if (!const_vec_duplicate_p (x, &elt))
8277 {
8278 output_operand_lossage ("invalid vector constant");
8279 return;
8280 }
8281
8282 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8283 asm_fprintf (f, "%wd", -INTVAL (elt));
8284 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8285 && aarch64_print_vector_float_operand (f, x, true))
8286 ;
8287 else
8288 {
8289 output_operand_lossage ("invalid vector constant");
8290 return;
8291 }
8292 break;
8293
8294 case 'b':
8295 case 'h':
8296 case 's':
8297 case 'd':
8298 case 'q':
8299 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8300 {
8301 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8302 return;
8303 }
8304 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8305 break;
8306
8307 case 'S':
8308 case 'T':
8309 case 'U':
8310 case 'V':
8311 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8312 {
8313 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8314 return;
8315 }
8316 asm_fprintf (f, "%c%d",
8317 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8318 REGNO (x) - V0_REGNUM + (code - 'S'));
8319 break;
8320
8321 case 'R':
8322 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8323 {
8324 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8325 return;
8326 }
8327 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8328 break;
8329
8330 case 'X':
8331 if (!CONST_INT_P (x))
8332 {
8333 output_operand_lossage ("invalid operand for '%%%c'", code);
8334 return;
8335 }
8336 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8337 break;
8338
8339 case 'C':
8340 {
8341 /* Print a replicated constant in hex. */
8342 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8343 {
8344 output_operand_lossage ("invalid operand for '%%%c'", code);
8345 return;
8346 }
8347 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8348 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8349 }
8350 break;
8351
8352 case 'D':
8353 {
8354 /* Print a replicated constant in decimal, treating it as
8355 unsigned. */
8356 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8357 {
8358 output_operand_lossage ("invalid operand for '%%%c'", code);
8359 return;
8360 }
8361 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8362 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8363 }
8364 break;
8365
8366 case 'w':
8367 case 'x':
8368 if (x == const0_rtx
8369 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8370 {
8371 asm_fprintf (f, "%czr", code);
8372 break;
8373 }
8374
8375 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8376 {
8377 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8378 break;
8379 }
8380
8381 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8382 {
8383 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8384 break;
8385 }
8386
8387 /* Fall through */
8388
8389 case 0:
8390 if (x == NULL)
8391 {
8392 output_operand_lossage ("missing operand");
8393 return;
8394 }
8395
8396 switch (GET_CODE (x))
8397 {
8398 case REG:
8399 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8400 {
8401 if (REG_NREGS (x) == 1)
8402 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8403 else
8404 {
8405 char suffix
8406 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8407 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8408 REGNO (x) - V0_REGNUM, suffix,
8409 END_REGNO (x) - V0_REGNUM - 1, suffix);
8410 }
8411 }
8412 else
8413 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8414 break;
8415
8416 case MEM:
8417 output_address (GET_MODE (x), XEXP (x, 0));
8418 break;
8419
8420 case LABEL_REF:
8421 case SYMBOL_REF:
8422 output_addr_const (asm_out_file, x);
8423 break;
8424
8425 case CONST_INT:
8426 asm_fprintf (f, "%wd", INTVAL (x));
8427 break;
8428
8429 case CONST:
8430 if (!VECTOR_MODE_P (GET_MODE (x)))
8431 {
8432 output_addr_const (asm_out_file, x);
8433 break;
8434 }
8435 /* fall through */
8436
8437 case CONST_VECTOR:
8438 if (!const_vec_duplicate_p (x, &elt))
8439 {
8440 output_operand_lossage ("invalid vector constant");
8441 return;
8442 }
8443
8444 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8445 asm_fprintf (f, "%wd", INTVAL (elt));
8446 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8447 && aarch64_print_vector_float_operand (f, x, false))
8448 ;
8449 else
8450 {
8451 output_operand_lossage ("invalid vector constant");
8452 return;
8453 }
8454 break;
8455
8456 case CONST_DOUBLE:
8457 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8458 be getting CONST_DOUBLEs holding integers. */
8459 gcc_assert (GET_MODE (x) != VOIDmode);
8460 if (aarch64_float_const_zero_rtx_p (x))
8461 {
8462 fputc ('0', f);
8463 break;
8464 }
8465 else if (aarch64_float_const_representable_p (x))
8466 {
8467 #define buf_size 20
8468 char float_buf[buf_size] = {'\0'};
8469 real_to_decimal_for_mode (float_buf,
8470 CONST_DOUBLE_REAL_VALUE (x),
8471 buf_size, buf_size,
8472 1, GET_MODE (x));
8473 asm_fprintf (asm_out_file, "%s", float_buf);
8474 break;
8475 #undef buf_size
8476 }
8477 output_operand_lossage ("invalid constant");
8478 return;
8479 default:
8480 output_operand_lossage ("invalid operand");
8481 return;
8482 }
8483 break;
8484
8485 case 'A':
8486 if (GET_CODE (x) == HIGH)
8487 x = XEXP (x, 0);
8488
8489 switch (aarch64_classify_symbolic_expression (x))
8490 {
8491 case SYMBOL_SMALL_GOT_4G:
8492 asm_fprintf (asm_out_file, ":got:");
8493 break;
8494
8495 case SYMBOL_SMALL_TLSGD:
8496 asm_fprintf (asm_out_file, ":tlsgd:");
8497 break;
8498
8499 case SYMBOL_SMALL_TLSDESC:
8500 asm_fprintf (asm_out_file, ":tlsdesc:");
8501 break;
8502
8503 case SYMBOL_SMALL_TLSIE:
8504 asm_fprintf (asm_out_file, ":gottprel:");
8505 break;
8506
8507 case SYMBOL_TLSLE24:
8508 asm_fprintf (asm_out_file, ":tprel:");
8509 break;
8510
8511 case SYMBOL_TINY_GOT:
8512 gcc_unreachable ();
8513 break;
8514
8515 default:
8516 break;
8517 }
8518 output_addr_const (asm_out_file, x);
8519 break;
8520
8521 case 'L':
8522 switch (aarch64_classify_symbolic_expression (x))
8523 {
8524 case SYMBOL_SMALL_GOT_4G:
8525 asm_fprintf (asm_out_file, ":lo12:");
8526 break;
8527
8528 case SYMBOL_SMALL_TLSGD:
8529 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8530 break;
8531
8532 case SYMBOL_SMALL_TLSDESC:
8533 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8534 break;
8535
8536 case SYMBOL_SMALL_TLSIE:
8537 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8538 break;
8539
8540 case SYMBOL_TLSLE12:
8541 asm_fprintf (asm_out_file, ":tprel_lo12:");
8542 break;
8543
8544 case SYMBOL_TLSLE24:
8545 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8546 break;
8547
8548 case SYMBOL_TINY_GOT:
8549 asm_fprintf (asm_out_file, ":got:");
8550 break;
8551
8552 case SYMBOL_TINY_TLSIE:
8553 asm_fprintf (asm_out_file, ":gottprel:");
8554 break;
8555
8556 default:
8557 break;
8558 }
8559 output_addr_const (asm_out_file, x);
8560 break;
8561
8562 case 'G':
8563 switch (aarch64_classify_symbolic_expression (x))
8564 {
8565 case SYMBOL_TLSLE24:
8566 asm_fprintf (asm_out_file, ":tprel_hi12:");
8567 break;
8568 default:
8569 break;
8570 }
8571 output_addr_const (asm_out_file, x);
8572 break;
8573
8574 case 'k':
8575 {
8576 HOST_WIDE_INT cond_code;
8577
8578 if (!CONST_INT_P (x))
8579 {
8580 output_operand_lossage ("invalid operand for '%%%c'", code);
8581 return;
8582 }
8583
8584 cond_code = INTVAL (x);
8585 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8586 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8587 }
8588 break;
8589
8590 case 'y':
8591 case 'z':
8592 {
8593 machine_mode mode = GET_MODE (x);
8594
8595 if (GET_CODE (x) != MEM
8596 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8597 {
8598 output_operand_lossage ("invalid operand for '%%%c'", code);
8599 return;
8600 }
8601
8602 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8603 code == 'y'
8604 ? ADDR_QUERY_LDP_STP_N
8605 : ADDR_QUERY_LDP_STP))
8606 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8607 }
8608 break;
8609
8610 default:
8611 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8612 return;
8613 }
8614 }
8615
8616 /* Print address 'x' of a memory access with mode 'mode'.
8617 'type' is the aarch64_addr_query_type context required by
8618 aarch64_classify_address; it distinguishes normal accesses from LDP/STP. */
8619 static bool
8620 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8621 aarch64_addr_query_type type)
8622 {
8623 struct aarch64_address_info addr;
8624 unsigned int size;
8625
8626 /* Check that all addresses are Pmode, including under ILP32. */
8627 if (GET_MODE (x) != Pmode
8628 && (!CONST_INT_P (x)
8629 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8630 {
8631 output_operand_lossage ("invalid address mode");
8632 return false;
8633 }
8634
8635 if (aarch64_classify_address (&addr, x, mode, true, type))
8636 switch (addr.type)
8637 {
8638 case ADDRESS_REG_IMM:
8639 if (known_eq (addr.const_offset, 0))
8640 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8641 else if (aarch64_sve_data_mode_p (mode))
8642 {
8643 HOST_WIDE_INT vnum
8644 = exact_div (addr.const_offset,
8645 BYTES_PER_SVE_VECTOR).to_constant ();
8646 asm_fprintf (f, "[%s, #%wd, mul vl]",
8647 reg_names[REGNO (addr.base)], vnum);
8648 }
8649 else if (aarch64_sve_pred_mode_p (mode))
8650 {
8651 HOST_WIDE_INT vnum
8652 = exact_div (addr.const_offset,
8653 BYTES_PER_SVE_PRED).to_constant ();
8654 asm_fprintf (f, "[%s, #%wd, mul vl]",
8655 reg_names[REGNO (addr.base)], vnum);
8656 }
8657 else
8658 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8659 INTVAL (addr.offset));
8660 return true;
8661
8662 case ADDRESS_REG_REG:
8663 if (addr.shift == 0)
8664 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8665 reg_names [REGNO (addr.offset)]);
8666 else
8667 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8668 reg_names [REGNO (addr.offset)], addr.shift);
8669 return true;
8670
8671 case ADDRESS_REG_UXTW:
8672 if (addr.shift == 0)
8673 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8674 REGNO (addr.offset) - R0_REGNUM);
8675 else
8676 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8677 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8678 return true;
8679
8680 case ADDRESS_REG_SXTW:
8681 if (addr.shift == 0)
8682 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8683 REGNO (addr.offset) - R0_REGNUM);
8684 else
8685 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8686 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8687 return true;
8688
8689 case ADDRESS_REG_WB:
8690 /* Writeback is only supported for fixed-width modes. */
8691 size = GET_MODE_SIZE (mode).to_constant ();
8692 switch (GET_CODE (x))
8693 {
8694 case PRE_INC:
8695 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8696 return true;
8697 case POST_INC:
8698 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8699 return true;
8700 case PRE_DEC:
8701 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8702 return true;
8703 case POST_DEC:
8704 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8705 return true;
8706 case PRE_MODIFY:
8707 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8708 INTVAL (addr.offset));
8709 return true;
8710 case POST_MODIFY:
8711 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8712 INTVAL (addr.offset));
8713 return true;
8714 default:
8715 break;
8716 }
8717 break;
8718
8719 case ADDRESS_LO_SUM:
8720 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8721 output_addr_const (f, addr.offset);
8722 asm_fprintf (f, "]");
8723 return true;
8724
8725 case ADDRESS_SYMBOLIC:
8726 output_addr_const (f, x);
8727 return true;
8728 }
8729
8730 return false;
8731 }
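/* For example, (plus (reg x1) (const_int 16)) is printed as "[x1, 16]",
   (post_inc (reg x2)) for a DImode access as "[x2], 8", and an SVE data
   mode address at an offset of one vector as "[x1, #1, mul vl]".  */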
8732
8733 /* Print address 'x' of a memory access with mode 'mode'. */
8734 static void
8735 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8736 {
8737 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8738 output_addr_const (f, x);
8739 }
8740
8741 bool
8742 aarch64_label_mentioned_p (rtx x)
8743 {
8744 const char *fmt;
8745 int i;
8746
8747 if (GET_CODE (x) == LABEL_REF)
8748 return true;
8749
8750 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8751 referencing instruction, but they are constant offsets, not
8752 symbols. */
8753 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8754 return false;
8755
8756 fmt = GET_RTX_FORMAT (GET_CODE (x));
8757 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8758 {
8759 if (fmt[i] == 'E')
8760 {
8761 int j;
8762
8763 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8764 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8765 return true;
8766 }
8767 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8768 return true;
8769 }
8770
8771 return false;
8772 }
8773
8774 /* Implement REGNO_REG_CLASS. */
8775
8776 enum reg_class
8777 aarch64_regno_regclass (unsigned regno)
8778 {
8779 if (GP_REGNUM_P (regno))
8780 return GENERAL_REGS;
8781
8782 if (regno == SP_REGNUM)
8783 return STACK_REG;
8784
8785 if (regno == FRAME_POINTER_REGNUM
8786 || regno == ARG_POINTER_REGNUM)
8787 return POINTER_REGS;
8788
8789 if (FP_REGNUM_P (regno))
8790 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8791 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8792
8793 if (PR_REGNUM_P (regno))
8794 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8795
8796 return NO_REGS;
8797 }
8798
8799 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8800 If OFFSET is out of range, return an offset of an anchor point
8801 that is in range. Return 0 otherwise. */
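/* For example, a misaligned SImode access at offset 0x10001 is given the
anchor 0x10000, leaving a small residual offset of 1 for the access itself. */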
8802
8803 static HOST_WIDE_INT
8804 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8805 machine_mode mode)
8806 {
8807 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8808 if (size > 16)
8809 return (offset + 0x400) & ~0x7f0;
8810
8811 /* For offsets that aren't a multiple of the access size, the limit is
8812 -256...255. */
8813 if (offset & (size - 1))
8814 {
8815 /* BLKmode typically uses LDP of X-registers. */
8816 if (mode == BLKmode)
8817 return (offset + 512) & ~0x3ff;
8818 return (offset + 0x100) & ~0x1ff;
8819 }
8820
8821 /* Small negative offsets are supported. */
8822 if (IN_RANGE (offset, -256, 0))
8823 return 0;
8824
8825 if (mode == TImode || mode == TFmode)
8826 return (offset + 0x100) & ~0x1ff;
8827
8828 /* Otherwise, use the unsigned 12-bit offset range, scaled by the access size. */
8829 return offset & (~0xfff * size);
8830 }
8831
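/* Implement TARGET_LEGITIMIZE_ADDRESS. */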
8832 static rtx
8833 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8834 {
8835 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8836 where mask is selected by alignment and size of the offset.
8837 We try to pick as large a range for the offset as possible to
8838 maximize the chance of a CSE. However, for aligned addresses
8839 we limit the range to 4k so that structures with different sized
8840 elements are likely to use the same base. We need to be careful
8841 not to split a CONST for some forms of address expression, otherwise
8842 it will generate sub-optimal code. */
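/* For example, for an SImode access, (plus (reg X) (const_int 0x13004)) is
split so that the base becomes X + 0x10000 and the remaining offset 0x3004
fits in the scaled 12-bit immediate range. */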
8843
8844 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8845 {
8846 rtx base = XEXP (x, 0);
8847 rtx offset_rtx = XEXP (x, 1);
8848 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8849
8850 if (GET_CODE (base) == PLUS)
8851 {
8852 rtx op0 = XEXP (base, 0);
8853 rtx op1 = XEXP (base, 1);
8854
8855 /* Force any scaling into a temp for CSE. */
8856 op0 = force_reg (Pmode, op0);
8857 op1 = force_reg (Pmode, op1);
8858
8859 /* Let the pointer register be in op0. */
8860 if (REG_POINTER (op1))
8861 std::swap (op0, op1);
8862
8863 /* If the pointer is virtual or frame related, then we know that
8864 virtual register instantiation or register elimination is going
8865 to apply a second constant. We want the two constants folded
8866 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8867 if (virt_or_elim_regno_p (REGNO (op0)))
8868 {
8869 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8870 NULL_RTX, true, OPTAB_DIRECT);
8871 return gen_rtx_PLUS (Pmode, base, op1);
8872 }
8873
8874 /* Otherwise, in order to encourage CSE (and thence loop strength
8875 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8876 base = expand_binop (Pmode, add_optab, op0, op1,
8877 NULL_RTX, true, OPTAB_DIRECT);
8878 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8879 }
8880
8881 HOST_WIDE_INT size;
8882 if (GET_MODE_SIZE (mode).is_constant (&size))
8883 {
8884 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8885 mode);
8886 if (base_offset != 0)
8887 {
8888 base = plus_constant (Pmode, base, base_offset);
8889 base = force_operand (base, NULL_RTX);
8890 return plus_constant (Pmode, base, offset - base_offset);
8891 }
8892 }
8893 }
8894
8895 return x;
8896 }
8897
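/* Implement TARGET_SECONDARY_RELOAD. */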
8898 static reg_class_t
8899 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8900 reg_class_t rclass,
8901 machine_mode mode,
8902 secondary_reload_info *sri)
8903 {
8904 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8905 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8906 comment at the head of aarch64-sve.md for more details about the
8907 big-endian handling. */
8908 if (BYTES_BIG_ENDIAN
8909 && reg_class_subset_p (rclass, FP_REGS)
8910 && !((REG_P (x) && HARD_REGISTER_P (x))
8911 || aarch64_simd_valid_immediate (x, NULL))
8912 && aarch64_sve_data_mode_p (mode))
8913 {
8914 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8915 return NO_REGS;
8916 }
8917
8918 /* If we have to disable direct literal pool loads and stores because the
8919 function is too big, then we need a scratch register. */
8920 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8921 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8922 || targetm.vector_mode_supported_p (GET_MODE (x)))
8923 && !aarch64_pcrelative_literal_loads)
8924 {
8925 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8926 return NO_REGS;
8927 }
8928
8929 /* Without the TARGET_SIMD instructions we cannot move a Q register
8930 to a Q register directly. We need a scratch. */
8931 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8932 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8933 && reg_class_subset_p (rclass, FP_REGS))
8934 {
8935 sri->icode = code_for_aarch64_reload_mov (mode);
8936 return NO_REGS;
8937 }
8938
8939 /* A TFmode or TImode memory access should be handled via FP_REGS
8940 because AArch64 has richer addressing modes for LDR/STR instructions
8941 than LDP/STP instructions. */
8942 if (TARGET_FLOAT && rclass == GENERAL_REGS
8943 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8944 return FP_REGS;
8945
8946 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
8947 return GENERAL_REGS;
8948
8949 return NO_REGS;
8950 }
8951
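/* Implement TARGET_CAN_ELIMINATE. Return true if register FROM can be
eliminated in favor of register TO. */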
8952 static bool
8953 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8954 {
8955 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8956
8957 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8958 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8959 if (frame_pointer_needed)
8960 return to == HARD_FRAME_POINTER_REGNUM;
8961 return true;
8962 }
8963
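/* Implement INITIAL_ELIMINATION_OFFSET. Return the offset to add when
eliminating register FROM in favor of register TO. */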
8964 poly_int64
8965 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8966 {
8967 if (to == HARD_FRAME_POINTER_REGNUM)
8968 {
8969 if (from == ARG_POINTER_REGNUM)
8970 return cfun->machine->frame.hard_fp_offset;
8971
8972 if (from == FRAME_POINTER_REGNUM)
8973 return cfun->machine->frame.hard_fp_offset
8974 - cfun->machine->frame.locals_offset;
8975 }
8976
8977 if (to == STACK_POINTER_REGNUM)
8978 {
8979 if (from == FRAME_POINTER_REGNUM)
8980 return cfun->machine->frame.frame_size
8981 - cfun->machine->frame.locals_offset;
8982 }
8983
8984 return cfun->machine->frame.frame_size;
8985 }
8986
8987 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8988 previous frame. */
8989
8990 rtx
8991 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8992 {
8993 if (count != 0)
8994 return const0_rtx;
8995 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8996 }
8997
8998
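/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE. Output the trampoline code:
load the jump target and the static chain from the data words that follow,
then branch to the target. */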
8999 static void
9000 aarch64_asm_trampoline_template (FILE *f)
9001 {
9002 int offset1 = 16;
9003 int offset2 = 20;
9004
9005 if (aarch64_bti_enabled ())
9006 {
9007 asm_fprintf (f, "\thint\t34 // bti c\n");
9008 offset1 -= 4;
9009 offset2 -= 4;
9010 }
9011
9012 if (TARGET_ILP32)
9013 {
9014 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9015 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9016 offset1);
9017 }
9018 else
9019 {
9020 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9021 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9022 offset2);
9023 }
9024 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9025
9026 /* The trampoline needs an extra padding instruction. If BTI is enabled,
9027 the BTI instruction emitted at the beginning takes its place, so no
9028 extra padding is needed. */
9029 if (!aarch64_bti_enabled ())
9030 assemble_aligned_integer (4, const0_rtx);
9031
9032 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9033 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9034 }
9035
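/* Implement TARGET_TRAMPOLINE_INIT. Copy the trampoline template into
M_TRAMP and fill in the target function address and the static chain
value CHAIN_VALUE. */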
9036 static void
9037 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9038 {
9039 rtx fnaddr, mem, a_tramp;
9040 const int tramp_code_sz = 16;
9041
9042 /* We don't need to copy the trailing D-words; we fill those in below. */
9043 emit_block_move (m_tramp, assemble_trampoline_template (),
9044 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9045 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9046 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9047 if (GET_MODE (fnaddr) != ptr_mode)
9048 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9049 emit_move_insn (mem, fnaddr);
9050
9051 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9052 emit_move_insn (mem, chain_value);
9053
9054 /* XXX We should really define a "clear_cache" pattern and use
9055 gen_clear_cache(). */
9056 a_tramp = XEXP (m_tramp, 0);
9057 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9058 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9059 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9060 ptr_mode);
9061 }
9062
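/* Implement TARGET_CLASS_MAX_NREGS. */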
9063 static unsigned char
9064 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9065 {
9066 /* ??? Logically we should only need to provide a value when
9067 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9068 can hold MODE, but at the moment we need to handle all modes.
9069 Just ignore any runtime parts for registers that can't store them. */
9070 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9071 unsigned int nregs;
9072 switch (regclass)
9073 {
9074 case TAILCALL_ADDR_REGS:
9075 case POINTER_REGS:
9076 case GENERAL_REGS:
9077 case ALL_REGS:
9078 case POINTER_AND_FP_REGS:
9079 case FP_REGS:
9080 case FP_LO_REGS:
9081 case FP_LO8_REGS:
9082 if (aarch64_sve_data_mode_p (mode)
9083 && constant_multiple_p (GET_MODE_SIZE (mode),
9084 BYTES_PER_SVE_VECTOR, &nregs))
9085 return nregs;
9086 return (aarch64_vector_data_mode_p (mode)
9087 ? CEIL (lowest_size, UNITS_PER_VREG)
9088 : CEIL (lowest_size, UNITS_PER_WORD));
9089 case STACK_REG:
9090 case PR_REGS:
9091 case PR_LO_REGS:
9092 case PR_HI_REGS:
9093 return 1;
9094
9095 case NO_REGS:
9096 return 0;
9097
9098 default:
9099 break;
9100 }
9101 gcc_unreachable ();
9102 }
9103
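/* Implement TARGET_PREFERRED_RELOAD_CLASS. */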
9104 static reg_class_t
9105 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9106 {
9107 if (regclass == POINTER_REGS)
9108 return GENERAL_REGS;
9109
9110 if (regclass == STACK_REG)
9111 {
9112 if (REG_P (x)
9113 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9114 return regclass;
9115
9116 return NO_REGS;
9117 }
9118
9119 /* Register elimination can result in a request for
9120 SP+constant->FP_REGS. We cannot support such operations, which
9121 use SP as the source and an FP_REG as the destination, so reject
9122 them outright. */
9123 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9124 {
9125 rtx lhs = XEXP (x, 0);
9126
9127 /* Look through a possible SUBREG introduced by ILP32. */
9128 if (GET_CODE (lhs) == SUBREG)
9129 lhs = SUBREG_REG (lhs);
9130
9131 gcc_assert (REG_P (lhs));
9132 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9133 POINTER_REGS));
9134 return NO_REGS;
9135 }
9136
9137 return regclass;
9138 }
9139
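/* Implement ASM_OUTPUT_LABELREF. Print the assembler reference for NAME
to F; %U supplies the user label prefix. */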
9140 void
9141 aarch64_asm_output_labelref (FILE* f, const char *name)
9142 {
9143 asm_fprintf (f, "%U%s", name);
9144 }
9145
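/* Implement TARGET_ASM_CONSTRUCTOR. Emit an .init_array entry for SYMBOL,
using a priority-specific section when PRIORITY is not the default. */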
9146 static void
9147 aarch64_elf_asm_constructor (rtx symbol, int priority)
9148 {
9149 if (priority == DEFAULT_INIT_PRIORITY)
9150 default_ctor_section_asm_out_constructor (symbol, priority);
9151 else
9152 {
9153 section *s;
9154 /* Although PRIORITY is known to be in the range [0, 65535], and so
9155 18 bytes would be enough, the compiler might not know that. To avoid
9156 a -Wformat-truncation false positive, use a larger buffer. */
9157 char buf[23];
9158 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9159 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9160 switch_to_section (s);
9161 assemble_align (POINTER_SIZE);
9162 assemble_aligned_integer (POINTER_BYTES, symbol);
9163 }
9164 }
9165
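/* Implement TARGET_ASM_DESTRUCTOR. Emit a .fini_array entry for SYMBOL,
using a priority-specific section when PRIORITY is not the default. */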
9166 static void
9167 aarch64_elf_asm_destructor (rtx symbol, int priority)
9168 {
9169 if (priority == DEFAULT_INIT_PRIORITY)
9170 default_dtor_section_asm_out_destructor (symbol, priority);
9171 else
9172 {
9173 section *s;
9174 /* Although PRIORITY is known to be in the range [0, 65535], and so
9175 18 bytes would be enough, the compiler might not know that. To avoid
9176 a -Wformat-truncation false positive, use a larger buffer. */
9177 char buf[23];
9178 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9179 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9180 switch_to_section (s);
9181 assemble_align (POINTER_SIZE);
9182 assemble_aligned_integer (POINTER_BYTES, symbol);
9183 }
9184 }
9185
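/* Output the assembly for a casesi dispatch sequence: load the jump-table
entry selected by the index, add it (scaled by 4) to the address of a local
anchor label emitted after the branch, and branch to the result. */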
9186 const char*
9187 aarch64_output_casesi (rtx *operands)
9188 {
9189 char buf[100];
9190 char label[100];
9191 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9192 int index;
9193 static const char *const patterns[4][2] =
9194 {
9195 {
9196 "ldrb\t%w3, [%0,%w1,uxtw]",
9197 "add\t%3, %4, %w3, sxtb #2"
9198 },
9199 {
9200 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9201 "add\t%3, %4, %w3, sxth #2"
9202 },
9203 {
9204 "ldr\t%w3, [%0,%w1,uxtw #2]",
9205 "add\t%3, %4, %w3, sxtw #2"
9206 },
9207 /* We assume that DImode is only generated when not optimizing and
9208 that we don't really need 64-bit address offsets. That would
9209 imply an object file with 8GB of code in a single function! */
9210 {
9211 "ldr\t%w3, [%0,%w1,uxtw #2]",
9212 "add\t%3, %4, %w3, sxtw #2"
9213 }
9214 };
9215
9216 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9217
9218 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9219 index = exact_log2 (GET_MODE_SIZE (mode));
9220
9221 gcc_assert (index >= 0 && index <= 3);
9222
9223 /* Need to implement table size reduction, by changing the code below. */
9224 output_asm_insn (patterns[index][0], operands);
9225 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9226 snprintf (buf, sizeof (buf),
9227 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9228 output_asm_insn (buf, operands);
9229 output_asm_insn (patterns[index][1], operands);
9230 output_asm_insn ("br\t%3", operands);
9231 assemble_label (asm_out_file, label);
9232 return "";
9233 }
9234
9235
9236 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9237 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9238 operator. */
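/* For example, a SHIFT of 1 with a MASK of 0x1fe describes a byte operand
shifted left by one and yields 8, matching a UXTB extend. */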
9239
9240 int
9241 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9242 {
9243 if (shift >= 0 && shift <= 3)
9244 {
9245 int size;
9246 for (size = 8; size <= 32; size *= 2)
9247 {
9248 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9249 if (mask == bits << shift)
9250 return size;
9251 }
9252 }
9253 return 0;
9254 }
9255
9256 /* Constant pools are per function only when PC-relative
9257 literal loads are enabled or we are using the large memory
9258 model. */
9259
9260 static inline bool
9261 aarch64_can_use_per_function_literal_pools_p (void)
9262 {
9263 return (aarch64_pcrelative_literal_loads
9264 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9265 }
9266
9267 static bool
9268 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9269 {
9270 /* We can't use blocks for constants when we're using a per-function
9271 constant pool. */
9272 return !aarch64_can_use_per_function_literal_pools_p ();
9273 }
9274
9275 /* Select appropriate section for constants depending
9276 on where we place literal pools. */
9277
9278 static section *
9279 aarch64_select_rtx_section (machine_mode mode,
9280 rtx x,
9281 unsigned HOST_WIDE_INT align)
9282 {
9283 if (aarch64_can_use_per_function_literal_pools_p ())
9284 return function_section (current_function_decl);
9285
9286 return default_elf_select_rtx_section (mode, x, align);
9287 }
9288
9289 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9290 void
9291 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9292 HOST_WIDE_INT offset)
9293 {
9294 /* When using per-function literal pools, we must ensure that any code
9295 section is aligned to the minimal instruction length, lest we get
9296 errors from the assembler re "unaligned instructions". */
9297 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9298 ASM_OUTPUT_ALIGN (f, 2);
9299 }
9300
9301 /* Costs. */
9302
9303 /* Helper function for rtx cost calculation. Strip a shift expression
9304 from X. Returns the inner operand if successful, or the original
9305 expression on failure. */
9306 static rtx
9307 aarch64_strip_shift (rtx x)
9308 {
9309 rtx op = x;
9310
9311 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9312 we can convert both to ROR during final output. */
9313 if ((GET_CODE (op) == ASHIFT
9314 || GET_CODE (op) == ASHIFTRT
9315 || GET_CODE (op) == LSHIFTRT
9316 || GET_CODE (op) == ROTATERT
9317 || GET_CODE (op) == ROTATE)
9318 && CONST_INT_P (XEXP (op, 1)))
9319 return XEXP (op, 0);
9320
9321 if (GET_CODE (op) == MULT
9322 && CONST_INT_P (XEXP (op, 1))
9323 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9324 return XEXP (op, 0);
9325
9326 return x;
9327 }
9328
9329 /* Helper function for rtx cost calculation. Strip an extend
9330 expression from X. Returns the inner operand if successful, or the
9331 original expression on failure. We deal with a number of possible
9332 canonicalization variations here. If STRIP_SHIFT is true, then
9333 we can strip off a shift also. */
9334 static rtx
9335 aarch64_strip_extend (rtx x, bool strip_shift)
9336 {
9337 scalar_int_mode mode;
9338 rtx op = x;
9339
9340 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9341 return op;
9342
9343 /* Zero and sign extraction of a widened value. */
9344 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9345 && XEXP (op, 2) == const0_rtx
9346 && GET_CODE (XEXP (op, 0)) == MULT
9347 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9348 XEXP (op, 1)))
9349 return XEXP (XEXP (op, 0), 0);
9350
9351 /* It can also be represented (for zero-extend) as an AND with an
9352 immediate. */
9353 if (GET_CODE (op) == AND
9354 && GET_CODE (XEXP (op, 0)) == MULT
9355 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9356 && CONST_INT_P (XEXP (op, 1))
9357 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9358 INTVAL (XEXP (op, 1))) != 0)
9359 return XEXP (XEXP (op, 0), 0);
9360
9361 /* Now handle extended register, as this may also have an optional
9362 left shift by 1..4. */
9363 if (strip_shift
9364 && GET_CODE (op) == ASHIFT
9365 && CONST_INT_P (XEXP (op, 1))
9366 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9367 op = XEXP (op, 0);
9368
9369 if (GET_CODE (op) == ZERO_EXTEND
9370 || GET_CODE (op) == SIGN_EXTEND)
9371 op = XEXP (op, 0);
9372
9373 if (op != x)
9374 return op;
9375
9376 return x;
9377 }
9378
9379 /* Return true iff CODE is a shift supported in combination
9380 with arithmetic instructions. */
9381
9382 static bool
9383 aarch64_shift_p (enum rtx_code code)
9384 {
9385 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9386 }
9387
9388
9389 /* Return true iff X is a cheap shift without a sign extend. */
9390
9391 static bool
9392 aarch64_cheap_mult_shift_p (rtx x)
9393 {
9394 rtx op0, op1;
9395
9396 op0 = XEXP (x, 0);
9397 op1 = XEXP (x, 1);
9398
9399 if (!(aarch64_tune_params.extra_tuning_flags
9400 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9401 return false;
9402
9403 if (GET_CODE (op0) == SIGN_EXTEND)
9404 return false;
9405
9406 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9407 && UINTVAL (op1) <= 4)
9408 return true;
9409
9410 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9411 return false;
9412
9413 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9414
9415 if (l2 > 0 && l2 <= 4)
9416 return true;
9417
9418 return false;
9419 }
9420
9421 /* Helper function for rtx cost calculation. Calculate the cost of
9422 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9423 Return the calculated cost of the expression, recursing manually in to
9424 operands where needed. */
9425
9426 static int
9427 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9428 {
9429 rtx op0, op1;
9430 const struct cpu_cost_table *extra_cost
9431 = aarch64_tune_params.insn_extra_cost;
9432 int cost = 0;
9433 bool compound_p = (outer == PLUS || outer == MINUS);
9434 machine_mode mode = GET_MODE (x);
9435
9436 gcc_checking_assert (code == MULT);
9437
9438 op0 = XEXP (x, 0);
9439 op1 = XEXP (x, 1);
9440
9441 if (VECTOR_MODE_P (mode))
9442 mode = GET_MODE_INNER (mode);
9443
9444 /* Integer multiply/fma. */
9445 if (GET_MODE_CLASS (mode) == MODE_INT)
9446 {
9447 /* The multiply will be canonicalized as a shift, cost it as such. */
9448 if (aarch64_shift_p (GET_CODE (x))
9449 || (CONST_INT_P (op1)
9450 && exact_log2 (INTVAL (op1)) > 0))
9451 {
9452 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9453 || GET_CODE (op0) == SIGN_EXTEND;
9454 if (speed)
9455 {
9456 if (compound_p)
9457 {
9458 /* If the shift is considered cheap,
9459 then don't add any cost. */
9460 if (aarch64_cheap_mult_shift_p (x))
9461 ;
9462 else if (REG_P (op1))
9463 /* ARITH + shift-by-register. */
9464 cost += extra_cost->alu.arith_shift_reg;
9465 else if (is_extend)
9466 /* ARITH + extended register. We don't have a cost field
9467 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9468 cost += extra_cost->alu.extend_arith;
9469 else
9470 /* ARITH + shift-by-immediate. */
9471 cost += extra_cost->alu.arith_shift;
9472 }
9473 else
9474 /* LSL (immediate). */
9475 cost += extra_cost->alu.shift;
9476
9477 }
9478 /* Strip extends as we will have costed them in the case above. */
9479 if (is_extend)
9480 op0 = aarch64_strip_extend (op0, true);
9481
9482 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9483
9484 return cost;
9485 }
9486
9487 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9488 compound and let the below cases handle it. After all, MNEG is a
9489 special-case alias of MSUB. */
9490 if (GET_CODE (op0) == NEG)
9491 {
9492 op0 = XEXP (op0, 0);
9493 compound_p = true;
9494 }
9495
9496 /* Integer multiplies or FMAs have zero/sign extending variants. */
9497 if ((GET_CODE (op0) == ZERO_EXTEND
9498 && GET_CODE (op1) == ZERO_EXTEND)
9499 || (GET_CODE (op0) == SIGN_EXTEND
9500 && GET_CODE (op1) == SIGN_EXTEND))
9501 {
9502 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9503 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9504
9505 if (speed)
9506 {
9507 if (compound_p)
9508 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9509 cost += extra_cost->mult[0].extend_add;
9510 else
9511 /* MUL/SMULL/UMULL. */
9512 cost += extra_cost->mult[0].extend;
9513 }
9514
9515 return cost;
9516 }
9517
9518 /* This is either an integer multiply or a MADD. In both cases
9519 we want to recurse and cost the operands. */
9520 cost += rtx_cost (op0, mode, MULT, 0, speed);
9521 cost += rtx_cost (op1, mode, MULT, 1, speed);
9522
9523 if (speed)
9524 {
9525 if (compound_p)
9526 /* MADD/MSUB. */
9527 cost += extra_cost->mult[mode == DImode].add;
9528 else
9529 /* MUL. */
9530 cost += extra_cost->mult[mode == DImode].simple;
9531 }
9532
9533 return cost;
9534 }
9535 else
9536 {
9537 if (speed)
9538 {
9539 /* Floating-point FMA/FMUL can also support negations of the
9540 operands, unless the rounding mode is upward or downward in
9541 which case FNMUL is different from FMUL with operand negation. */
9542 bool neg0 = GET_CODE (op0) == NEG;
9543 bool neg1 = GET_CODE (op1) == NEG;
9544 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9545 {
9546 if (neg0)
9547 op0 = XEXP (op0, 0);
9548 if (neg1)
9549 op1 = XEXP (op1, 0);
9550 }
9551
9552 if (compound_p)
9553 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9554 cost += extra_cost->fp[mode == DFmode].fma;
9555 else
9556 /* FMUL/FNMUL. */
9557 cost += extra_cost->fp[mode == DFmode].mult;
9558 }
9559
9560 cost += rtx_cost (op0, mode, MULT, 0, speed);
9561 cost += rtx_cost (op1, mode, MULT, 1, speed);
9562 return cost;
9563 }
9564 }
9565
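/* Implement TARGET_ADDRESS_COST. */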
9566 static int
9567 aarch64_address_cost (rtx x,
9568 machine_mode mode,
9569 addr_space_t as ATTRIBUTE_UNUSED,
9570 bool speed)
9571 {
9572 enum rtx_code c = GET_CODE (x);
9573 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9574 struct aarch64_address_info info;
9575 int cost = 0;
9576 info.shift = 0;
9577
9578 if (!aarch64_classify_address (&info, x, mode, false))
9579 {
9580 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9581 {
9582 /* This is a CONST or SYMBOL ref which will be split
9583 in a different way depending on the code model in use.
9584 Cost it through the generic infrastructure. */
9585 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9586 /* Divide through by the cost of one instruction to
9587 bring it to the same units as the address costs. */
9588 cost_symbol_ref /= COSTS_N_INSNS (1);
9589 /* The cost is then the cost of preparing the address,
9590 followed by an immediate (possibly 0) offset. */
9591 return cost_symbol_ref + addr_cost->imm_offset;
9592 }
9593 else
9594 {
9595 /* This is most likely a jump table from a case
9596 statement. */
9597 return addr_cost->register_offset;
9598 }
9599 }
9600
9601 switch (info.type)
9602 {
9603 case ADDRESS_LO_SUM:
9604 case ADDRESS_SYMBOLIC:
9605 case ADDRESS_REG_IMM:
9606 cost += addr_cost->imm_offset;
9607 break;
9608
9609 case ADDRESS_REG_WB:
9610 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9611 cost += addr_cost->pre_modify;
9612 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9613 cost += addr_cost->post_modify;
9614 else
9615 gcc_unreachable ();
9616
9617 break;
9618
9619 case ADDRESS_REG_REG:
9620 cost += addr_cost->register_offset;
9621 break;
9622
9623 case ADDRESS_REG_SXTW:
9624 cost += addr_cost->register_sextend;
9625 break;
9626
9627 case ADDRESS_REG_UXTW:
9628 cost += addr_cost->register_zextend;
9629 break;
9630
9631 default:
9632 gcc_unreachable ();
9633 }
9634
9635
9636 if (info.shift > 0)
9637 {
9638 /* For the sake of calculating the cost of the shifted register
9639 component, we can treat same sized modes in the same way. */
9640 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9641 cost += addr_cost->addr_scale_costs.hi;
9642 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9643 cost += addr_cost->addr_scale_costs.si;
9644 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9645 cost += addr_cost->addr_scale_costs.di;
9646 else
9647 /* We can't tell, or this is a 128-bit vector. */
9648 cost += addr_cost->addr_scale_costs.ti;
9649 }
9650
9651 return cost;
9652 }
9653
9654 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9655 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9656 to be taken. */
9657
9658 int
9659 aarch64_branch_cost (bool speed_p, bool predictable_p)
9660 {
9661 /* When optimizing for speed, use the cost of unpredictable branches. */
9662 const struct cpu_branch_cost *branch_costs =
9663 aarch64_tune_params.branch_costs;
9664
9665 if (!speed_p || predictable_p)
9666 return branch_costs->predictable;
9667 else
9668 return branch_costs->unpredictable;
9669 }
9670
9671 /* Return true if the RTX X in mode MODE is a zero or sign extract
9672 usable in an ADD or SUB (extended register) instruction. */
9673 static bool
9674 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9675 {
9676 /* Catch add with a sign extract.
9677 This is add_<optab><mode>_multp2. */
9678 if (GET_CODE (x) == SIGN_EXTRACT
9679 || GET_CODE (x) == ZERO_EXTRACT)
9680 {
9681 rtx op0 = XEXP (x, 0);
9682 rtx op1 = XEXP (x, 1);
9683 rtx op2 = XEXP (x, 2);
9684
9685 if (GET_CODE (op0) == MULT
9686 && CONST_INT_P (op1)
9687 && op2 == const0_rtx
9688 && CONST_INT_P (XEXP (op0, 1))
9689 && aarch64_is_extend_from_extract (mode,
9690 XEXP (op0, 1),
9691 op1))
9692 {
9693 return true;
9694 }
9695 }
9696 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9697 No shift. */
9698 else if (GET_CODE (x) == SIGN_EXTEND
9699 || GET_CODE (x) == ZERO_EXTEND)
9700 return REG_P (XEXP (x, 0));
9701
9702 return false;
9703 }
9704
9705 static bool
9706 aarch64_frint_unspec_p (unsigned int u)
9707 {
9708 switch (u)
9709 {
9710 case UNSPEC_FRINTZ:
9711 case UNSPEC_FRINTP:
9712 case UNSPEC_FRINTM:
9713 case UNSPEC_FRINTA:
9714 case UNSPEC_FRINTN:
9715 case UNSPEC_FRINTX:
9716 case UNSPEC_FRINTI:
9717 return true;
9718
9719 default:
9720 return false;
9721 }
9722 }
9723
9724 /* Return true iff X is an rtx that will match an extr instruction
9725 i.e. as described in the *extr<mode>5_insn family of patterns.
9726 OP0 and OP1 will be set to the operands of the shifts involved
9727 on success and will be NULL_RTX otherwise. */
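/* For example, in DImode (ior (ashift (reg A) (const_int 10))
(lshiftrt (reg B) (const_int 54))) matches, since the two shift
amounts sum to the mode bitsize. */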
9728
9729 static bool
9730 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9731 {
9732 rtx op0, op1;
9733 scalar_int_mode mode;
9734 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9735 return false;
9736
9737 *res_op0 = NULL_RTX;
9738 *res_op1 = NULL_RTX;
9739
9740 if (GET_CODE (x) != IOR)
9741 return false;
9742
9743 op0 = XEXP (x, 0);
9744 op1 = XEXP (x, 1);
9745
9746 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9747 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9748 {
9749 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9750 if (GET_CODE (op1) == ASHIFT)
9751 std::swap (op0, op1);
9752
9753 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9754 return false;
9755
9756 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9757 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9758
9759 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9760 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9761 {
9762 *res_op0 = XEXP (op0, 0);
9763 *res_op1 = XEXP (op1, 0);
9764 return true;
9765 }
9766 }
9767
9768 return false;
9769 }
9770
9771 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9772 storing it in *COST. Result is true if the total cost of the operation
9773 has now been calculated. */
9774 static bool
9775 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9776 {
9777 rtx inner;
9778 rtx comparator;
9779 enum rtx_code cmpcode;
9780
9781 if (COMPARISON_P (op0))
9782 {
9783 inner = XEXP (op0, 0);
9784 comparator = XEXP (op0, 1);
9785 cmpcode = GET_CODE (op0);
9786 }
9787 else
9788 {
9789 inner = op0;
9790 comparator = const0_rtx;
9791 cmpcode = NE;
9792 }
9793
9794 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9795 {
9796 /* Conditional branch. */
9797 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9798 return true;
9799 else
9800 {
9801 if (cmpcode == NE || cmpcode == EQ)
9802 {
9803 if (comparator == const0_rtx)
9804 {
9805 /* TBZ/TBNZ/CBZ/CBNZ. */
9806 if (GET_CODE (inner) == ZERO_EXTRACT)
9807 /* TBZ/TBNZ. */
9808 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9809 ZERO_EXTRACT, 0, speed);
9810 else
9811 /* CBZ/CBNZ. */
9812 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9813
9814 return true;
9815 }
9816 }
9817 else if (cmpcode == LT || cmpcode == GE)
9818 {
9819 /* TBZ/TBNZ. */
9820 if (comparator == const0_rtx)
9821 return true;
9822 }
9823 }
9824 }
9825 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9826 {
9827 /* CCMP. */
9828 if (GET_CODE (op1) == COMPARE)
9829 {
9830 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9831 if (XEXP (op1, 1) == const0_rtx)
9832 *cost += 1;
9833 if (speed)
9834 {
9835 machine_mode mode = GET_MODE (XEXP (op1, 0));
9836 const struct cpu_cost_table *extra_cost
9837 = aarch64_tune_params.insn_extra_cost;
9838
9839 if (GET_MODE_CLASS (mode) == MODE_INT)
9840 *cost += extra_cost->alu.arith;
9841 else
9842 *cost += extra_cost->fp[mode == DFmode].compare;
9843 }
9844 return true;
9845 }
9846
9847 /* It's a conditional operation based on the status flags,
9848 so it must be some flavor of CSEL. */
9849
9850 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9851 if (GET_CODE (op1) == NEG
9852 || GET_CODE (op1) == NOT
9853 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9854 op1 = XEXP (op1, 0);
9855 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9856 {
9857 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9858 op1 = XEXP (op1, 0);
9859 op2 = XEXP (op2, 0);
9860 }
9861
9862 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9863 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9864 return true;
9865 }
9866
9867 /* We don't know what this is, cost all operands. */
9868 return false;
9869 }
9870
9871 /* Check whether X is a bitfield operation of the form shift + extend that
9872 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9873 operand to which the bitfield operation is applied. Otherwise return
9874 NULL_RTX. */
9875
9876 static rtx
9877 aarch64_extend_bitfield_pattern_p (rtx x)
9878 {
9879 rtx_code outer_code = GET_CODE (x);
9880 machine_mode outer_mode = GET_MODE (x);
9881
9882 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9883 && outer_mode != SImode && outer_mode != DImode)
9884 return NULL_RTX;
9885
9886 rtx inner = XEXP (x, 0);
9887 rtx_code inner_code = GET_CODE (inner);
9888 machine_mode inner_mode = GET_MODE (inner);
9889 rtx op = NULL_RTX;
9890
9891 switch (inner_code)
9892 {
9893 case ASHIFT:
9894 if (CONST_INT_P (XEXP (inner, 1))
9895 && (inner_mode == QImode || inner_mode == HImode))
9896 op = XEXP (inner, 0);
9897 break;
9898 case LSHIFTRT:
9899 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9900 && (inner_mode == QImode || inner_mode == HImode))
9901 op = XEXP (inner, 0);
9902 break;
9903 case ASHIFTRT:
9904 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9905 && (inner_mode == QImode || inner_mode == HImode))
9906 op = XEXP (inner, 0);
9907 break;
9908 default:
9909 break;
9910 }
9911
9912 return op;
9913 }
9914
9915 /* Return true if the mask and a shift amount from an RTX of the form
9916 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9917 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
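/* For example, in DImode a MASK of 0xff0 with a SHFT_AMNT of 4 is valid
(an 8-bit UBFIZ at position 4), whereas a MASK of 0xff1 is not, because
it has a bit set below the shift amount. */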
9918
9919 bool
9920 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9921 rtx shft_amnt)
9922 {
9923 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9924 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9925 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9926 && (INTVAL (mask)
9927 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9928 }
9929
9930 /* Return true if the masks and a shift amount from an RTX of the form
9931 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9932 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
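/* For example, in DImode MASK2 == 0xff00 with SHIFT_AMNT == 8 and
MASK1 == ~0xff00 is accepted: it inserts an 8-bit field at position 8. */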
9933
9934 bool
9935 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9936 unsigned HOST_WIDE_INT mask1,
9937 unsigned HOST_WIDE_INT shft_amnt,
9938 unsigned HOST_WIDE_INT mask2)
9939 {
9940 unsigned HOST_WIDE_INT t;
9941
9942 /* Verify that there is no overlap in what bits are set in the two masks. */
9943 if (mask1 != ~mask2)
9944 return false;
9945
9946 /* Verify that mask2 is not all zeros or ones. */
9947 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9948 return false;
9949
9950 /* The shift amount should always be less than the mode size. */
9951 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9952
9953 /* Verify that the mask being shifted is contiguous and would be in the
9954 least significant bits after shifting by shft_amnt. */
9955 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9956 return (t == (t & -t));
9957 }
9958
9959 /* Calculate the cost of calculating X, storing it in *COST. Result
9960 is true if the total cost of the operation has now been calculated. */
9961 static bool
9962 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9963 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9964 {
9965 rtx op0, op1, op2;
9966 const struct cpu_cost_table *extra_cost
9967 = aarch64_tune_params.insn_extra_cost;
9968 int code = GET_CODE (x);
9969 scalar_int_mode int_mode;
9970
9971 /* By default, assume that everything has equivalent cost to the
9972 cheapest instruction. Any additional costs are applied as a delta
9973 above this default. */
9974 *cost = COSTS_N_INSNS (1);
9975
9976 switch (code)
9977 {
9978 case SET:
9979 /* The cost depends entirely on the operands to SET. */
9980 *cost = 0;
9981 op0 = SET_DEST (x);
9982 op1 = SET_SRC (x);
9983
9984 switch (GET_CODE (op0))
9985 {
9986 case MEM:
9987 if (speed)
9988 {
9989 rtx address = XEXP (op0, 0);
9990 if (VECTOR_MODE_P (mode))
9991 *cost += extra_cost->ldst.storev;
9992 else if (GET_MODE_CLASS (mode) == MODE_INT)
9993 *cost += extra_cost->ldst.store;
9994 else if (mode == SFmode)
9995 *cost += extra_cost->ldst.storef;
9996 else if (mode == DFmode)
9997 *cost += extra_cost->ldst.stored;
9998
9999 *cost +=
10000 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10001 0, speed));
10002 }
10003
10004 *cost += rtx_cost (op1, mode, SET, 1, speed);
10005 return true;
10006
10007 case SUBREG:
10008 if (! REG_P (SUBREG_REG (op0)))
10009 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10010
10011 /* Fall through. */
10012 case REG:
10013 /* The cost is one per vector-register copied. */
10014 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10015 {
10016 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10017 *cost = COSTS_N_INSNS (nregs);
10018 }
10019 /* const0_rtx is in general free, but we will use an
10020 instruction to set a register to 0. */
10021 else if (REG_P (op1) || op1 == const0_rtx)
10022 {
10023 /* The cost is 1 per register copied. */
10024 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10025 *cost = COSTS_N_INSNS (nregs);
10026 }
10027 else
10028 /* Cost is just the cost of the RHS of the set. */
10029 *cost += rtx_cost (op1, mode, SET, 1, speed);
10030 return true;
10031
10032 case ZERO_EXTRACT:
10033 case SIGN_EXTRACT:
10034 /* Bit-field insertion. Strip any redundant widening of
10035 the RHS to meet the width of the target. */
10036 if (GET_CODE (op1) == SUBREG)
10037 op1 = SUBREG_REG (op1);
10038 if ((GET_CODE (op1) == ZERO_EXTEND
10039 || GET_CODE (op1) == SIGN_EXTEND)
10040 && CONST_INT_P (XEXP (op0, 1))
10041 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10042 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10043 op1 = XEXP (op1, 0);
10044
10045 if (CONST_INT_P (op1))
10046 {
10047 /* MOV immediate is assumed to always be cheap. */
10048 *cost = COSTS_N_INSNS (1);
10049 }
10050 else
10051 {
10052 /* BFM. */
10053 if (speed)
10054 *cost += extra_cost->alu.bfi;
10055 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10056 }
10057
10058 return true;
10059
10060 default:
10061 /* We can't make sense of this, assume default cost. */
10062 *cost = COSTS_N_INSNS (1);
10063 return false;
10064 }
10065 return false;
10066
10067 case CONST_INT:
10068 /* If an instruction can incorporate a constant within the
10069 instruction, the instruction's expression avoids calling
10070 rtx_cost() on the constant. If rtx_cost() is called on a
10071 constant, then it is usually because the constant must be
10072 moved into a register by one or more instructions.
10073
10074 The exception is constant 0, which can be expressed
10075 as XZR/WZR and is therefore free. The exception to this is
10076 if we have (set (reg) (const0_rtx)) in which case we must cost
10077 the move. However, we can catch that when we cost the SET, so
10078 we don't need to consider that here. */
10079 if (x == const0_rtx)
10080 *cost = 0;
10081 else
10082 {
10083 /* To an approximation, building any other constant is
10084 proportionally expensive to the number of instructions
10085 required to build that constant. This is true whether we
10086 are compiling for SPEED or otherwise. */
10087 if (!is_a <scalar_int_mode> (mode, &int_mode))
10088 int_mode = word_mode;
10089 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10090 (NULL_RTX, x, false, int_mode));
10091 }
10092 return true;
10093
10094 case CONST_DOUBLE:
10095
10096 /* First determine number of instructions to do the move
10097 as an integer constant. */
10098 if (!aarch64_float_const_representable_p (x)
10099 && !aarch64_can_const_movi_rtx_p (x, mode)
10100 && aarch64_float_const_rtx_p (x))
10101 {
10102 unsigned HOST_WIDE_INT ival;
10103 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10104 gcc_assert (succeed);
10105
10106 scalar_int_mode imode = (mode == HFmode
10107 ? SImode
10108 : int_mode_for_mode (mode).require ());
10109 int ncost = aarch64_internal_mov_immediate
10110 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10111 *cost += COSTS_N_INSNS (ncost);
10112 return true;
10113 }
10114
10115 if (speed)
10116 {
10117 /* mov[df,sf]_aarch64. */
10118 if (aarch64_float_const_representable_p (x))
10119 /* FMOV (scalar immediate). */
10120 *cost += extra_cost->fp[mode == DFmode].fpconst;
10121 else if (!aarch64_float_const_zero_rtx_p (x))
10122 {
10123 /* This will be a load from memory. */
10124 if (mode == DFmode)
10125 *cost += extra_cost->ldst.loadd;
10126 else
10127 *cost += extra_cost->ldst.loadf;
10128 }
10129 else
10130 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10131 or MOV v0.s[0], wzr - neither of which is modeled by the
10132 cost tables. Just use the default cost. */
10133 {
10134 }
10135 }
10136
10137 return true;
10138
10139 case MEM:
10140 if (speed)
10141 {
10142 /* For loads we want the base cost of a load, plus an
10143 approximation for the additional cost of the addressing
10144 mode. */
10145 rtx address = XEXP (x, 0);
10146 if (VECTOR_MODE_P (mode))
10147 *cost += extra_cost->ldst.loadv;
10148 else if (GET_MODE_CLASS (mode) == MODE_INT)
10149 *cost += extra_cost->ldst.load;
10150 else if (mode == SFmode)
10151 *cost += extra_cost->ldst.loadf;
10152 else if (mode == DFmode)
10153 *cost += extra_cost->ldst.loadd;
10154
10155 *cost +=
10156 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10157 0, speed));
10158 }
10159
10160 return true;
10161
10162 case NEG:
10163 op0 = XEXP (x, 0);
10164
10165 if (VECTOR_MODE_P (mode))
10166 {
10167 if (speed)
10168 {
10169 /* FNEG. */
10170 *cost += extra_cost->vect.alu;
10171 }
10172 return false;
10173 }
10174
10175 if (GET_MODE_CLASS (mode) == MODE_INT)
10176 {
10177 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10178 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10179 {
10180 /* CSETM. */
10181 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10182 return true;
10183 }
10184
10185 /* Cost this as SUB wzr, X. */
10186 op0 = CONST0_RTX (mode);
10187 op1 = XEXP (x, 0);
10188 goto cost_minus;
10189 }
10190
10191 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10192 {
10193 /* Support (neg(fma...)) as a single instruction only if
10194 sign of zeros is unimportant. This matches the decision
10195 making in aarch64.md. */
10196 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10197 {
10198 /* FNMADD. */
10199 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10200 return true;
10201 }
10202 if (GET_CODE (op0) == MULT)
10203 {
10204 /* FNMUL. */
10205 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10206 return true;
10207 }
10208 if (speed)
10209 /* FNEG. */
10210 *cost += extra_cost->fp[mode == DFmode].neg;
10211 return false;
10212 }
10213
10214 return false;
10215
10216 case CLRSB:
10217 case CLZ:
10218 if (speed)
10219 {
10220 if (VECTOR_MODE_P (mode))
10221 *cost += extra_cost->vect.alu;
10222 else
10223 *cost += extra_cost->alu.clz;
10224 }
10225
10226 return false;
10227
10228 case COMPARE:
10229 op0 = XEXP (x, 0);
10230 op1 = XEXP (x, 1);
10231
10232 if (op1 == const0_rtx
10233 && GET_CODE (op0) == AND)
10234 {
10235 x = op0;
10236 mode = GET_MODE (op0);
10237 goto cost_logic;
10238 }
10239
10240 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10241 {
10242 /* TODO: A write to the CC flags possibly costs extra, this
10243 needs encoding in the cost tables. */
10244
10245 mode = GET_MODE (op0);
10246 /* ANDS. */
10247 if (GET_CODE (op0) == AND)
10248 {
10249 x = op0;
10250 goto cost_logic;
10251 }
10252
10253 if (GET_CODE (op0) == PLUS)
10254 {
10255 /* ADDS (and CMN alias). */
10256 x = op0;
10257 goto cost_plus;
10258 }
10259
10260 if (GET_CODE (op0) == MINUS)
10261 {
10262 /* SUBS. */
10263 x = op0;
10264 goto cost_minus;
10265 }
10266
10267 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10268 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10269 && CONST_INT_P (XEXP (op0, 2)))
10270 {
10271 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10272 Handle it here directly rather than going to cost_logic
10273 since we know the immediate generated for the TST is valid
10274 so we can avoid creating an intermediate rtx for it only
10275 for costing purposes. */
10276 if (speed)
10277 *cost += extra_cost->alu.logical;
10278
10279 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10280 ZERO_EXTRACT, 0, speed);
10281 return true;
10282 }
10283
10284 if (GET_CODE (op1) == NEG)
10285 {
10286 /* CMN. */
10287 if (speed)
10288 *cost += extra_cost->alu.arith;
10289
10290 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10291 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10292 return true;
10293 }
10294
10295 /* CMP.
10296
10297 Compare can freely swap the order of operands, and
10298 canonicalization puts the more complex operation first.
10299 But the integer MINUS logic expects the shift/extend
10300 operation in op1. */
10301 if (! (REG_P (op0)
10302 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10303 {
10304 op0 = XEXP (x, 1);
10305 op1 = XEXP (x, 0);
10306 }
10307 goto cost_minus;
10308 }
10309
10310 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10311 {
10312 /* FCMP. */
10313 if (speed)
10314 *cost += extra_cost->fp[mode == DFmode].compare;
10315
10316 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10317 {
10318 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10319 /* FCMP supports constant 0.0 for no extra cost. */
10320 return true;
10321 }
10322 return false;
10323 }
10324
10325 if (VECTOR_MODE_P (mode))
10326 {
10327 /* Vector compare. */
10328 if (speed)
10329 *cost += extra_cost->vect.alu;
10330
10331 if (aarch64_float_const_zero_rtx_p (op1))
10332 {
10333 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10334 cost. */
10335 return true;
10336 }
10337 return false;
10338 }
10339 return false;
10340
10341 case MINUS:
10342 {
10343 op0 = XEXP (x, 0);
10344 op1 = XEXP (x, 1);
10345
10346 cost_minus:
10347 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10348
10349 /* Detect valid immediates. */
10350 if ((GET_MODE_CLASS (mode) == MODE_INT
10351 || (GET_MODE_CLASS (mode) == MODE_CC
10352 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10353 && CONST_INT_P (op1)
10354 && aarch64_uimm12_shift (INTVAL (op1)))
10355 {
10356 if (speed)
10357 /* SUB(S) (immediate). */
10358 *cost += extra_cost->alu.arith;
10359 return true;
10360 }
10361
10362 /* Look for SUB (extended register). */
10363 if (is_a <scalar_int_mode> (mode, &int_mode)
10364 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10365 {
10366 if (speed)
10367 *cost += extra_cost->alu.extend_arith;
10368
10369 op1 = aarch64_strip_extend (op1, true);
10370 *cost += rtx_cost (op1, VOIDmode,
10371 (enum rtx_code) GET_CODE (op1), 0, speed);
10372 return true;
10373 }
10374
10375 rtx new_op1 = aarch64_strip_extend (op1, false);
10376
10377 /* Cost this as an FMA-alike operation. */
10378 if ((GET_CODE (new_op1) == MULT
10379 || aarch64_shift_p (GET_CODE (new_op1)))
10380 && code != COMPARE)
10381 {
10382 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10383 (enum rtx_code) code,
10384 speed);
10385 return true;
10386 }
10387
10388 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10389
10390 if (speed)
10391 {
10392 if (VECTOR_MODE_P (mode))
10393 {
10394 /* Vector SUB. */
10395 *cost += extra_cost->vect.alu;
10396 }
10397 else if (GET_MODE_CLASS (mode) == MODE_INT)
10398 {
10399 /* SUB(S). */
10400 *cost += extra_cost->alu.arith;
10401 }
10402 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10403 {
10404 /* FSUB. */
10405 *cost += extra_cost->fp[mode == DFmode].addsub;
10406 }
10407 }
10408 return true;
10409 }
10410
10411 case PLUS:
10412 {
10413 rtx new_op0;
10414
10415 op0 = XEXP (x, 0);
10416 op1 = XEXP (x, 1);
10417
10418 cost_plus:
10419 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10420 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10421 {
10422 /* CSINC. */
10423 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10424 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10425 return true;
10426 }
10427
10428 if (GET_MODE_CLASS (mode) == MODE_INT
10429 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10430 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10431 {
10432 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10433
10434 if (speed)
10435 /* ADD (immediate). */
10436 *cost += extra_cost->alu.arith;
10437 return true;
10438 }
10439
10440 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10441
10442 /* Look for ADD (extended register). */
10443 if (is_a <scalar_int_mode> (mode, &int_mode)
10444 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10445 {
10446 if (speed)
10447 *cost += extra_cost->alu.extend_arith;
10448
10449 op0 = aarch64_strip_extend (op0, true);
10450 *cost += rtx_cost (op0, VOIDmode,
10451 (enum rtx_code) GET_CODE (op0), 0, speed);
10452 return true;
10453 }
10454
10455 /* Strip any extend, leaving shifts behind, as we will
10456 cost them through mult_cost. */
10457 new_op0 = aarch64_strip_extend (op0, false);
10458
10459 if (GET_CODE (new_op0) == MULT
10460 || aarch64_shift_p (GET_CODE (new_op0)))
10461 {
10462 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10463 speed);
10464 return true;
10465 }
10466
10467 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10468
10469 if (speed)
10470 {
10471 if (VECTOR_MODE_P (mode))
10472 {
10473 /* Vector ADD. */
10474 *cost += extra_cost->vect.alu;
10475 }
10476 else if (GET_MODE_CLASS (mode) == MODE_INT)
10477 {
10478 /* ADD. */
10479 *cost += extra_cost->alu.arith;
10480 }
10481 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10482 {
10483 /* FADD. */
10484 *cost += extra_cost->fp[mode == DFmode].addsub;
10485 }
10486 }
10487 return true;
10488 }
10489
10490 case BSWAP:
10491 *cost = COSTS_N_INSNS (1);
10492
10493 if (speed)
10494 {
10495 if (VECTOR_MODE_P (mode))
10496 *cost += extra_cost->vect.alu;
10497 else
10498 *cost += extra_cost->alu.rev;
10499 }
10500 return false;
10501
10502 case IOR:
10503 if (aarch_rev16_p (x))
10504 {
10505 *cost = COSTS_N_INSNS (1);
10506
10507 if (speed)
10508 {
10509 if (VECTOR_MODE_P (mode))
10510 *cost += extra_cost->vect.alu;
10511 else
10512 *cost += extra_cost->alu.rev;
10513 }
10514 return true;
10515 }
10516
10517 if (aarch64_extr_rtx_p (x, &op0, &op1))
10518 {
10519 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10520 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10521 if (speed)
10522 *cost += extra_cost->alu.shift;
10523
10524 return true;
10525 }
10526 /* Fall through. */
10527 case XOR:
10528 case AND:
10529 cost_logic:
10530 op0 = XEXP (x, 0);
10531 op1 = XEXP (x, 1);
10532
10533 if (VECTOR_MODE_P (mode))
10534 {
10535 if (speed)
10536 *cost += extra_cost->vect.alu;
10537 return true;
10538 }
10539
10540 if (code == AND
10541 && GET_CODE (op0) == MULT
10542 && CONST_INT_P (XEXP (op0, 1))
10543 && CONST_INT_P (op1)
10544 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10545 INTVAL (op1)) != 0)
10546 {
10547 /* This is a UBFM/SBFM. */
10548 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10549 if (speed)
10550 *cost += extra_cost->alu.bfx;
10551 return true;
10552 }
10553
10554 if (is_int_mode (mode, &int_mode))
10555 {
10556 if (CONST_INT_P (op1))
10557 {
10558 /* We have a mask + shift version of a UBFIZ
10559 i.e. the *andim_ashift<mode>_bfiz pattern. */
10560 if (GET_CODE (op0) == ASHIFT
10561 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10562 XEXP (op0, 1)))
10563 {
10564 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10565 (enum rtx_code) code, 0, speed);
10566 if (speed)
10567 *cost += extra_cost->alu.bfx;
10568
10569 return true;
10570 }
10571 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10572 {
10573 /* We possibly get the immediate for free, this is not
10574 modelled. */
10575 *cost += rtx_cost (op0, int_mode,
10576 (enum rtx_code) code, 0, speed);
10577 if (speed)
10578 *cost += extra_cost->alu.logical;
10579
10580 return true;
10581 }
10582 }
10583 else
10584 {
10585 rtx new_op0 = op0;
10586
10587 /* Handle ORN, EON, or BIC. */
10588 if (GET_CODE (op0) == NOT)
10589 op0 = XEXP (op0, 0);
10590
10591 new_op0 = aarch64_strip_shift (op0);
10592
10593 /* If we had a shift on op0 then this is a logical-shift-
10594 by-register/immediate operation. Otherwise, this is just
10595 a logical operation. */
10596 if (speed)
10597 {
10598 if (new_op0 != op0)
10599 {
10600 /* Shift by immediate. */
10601 if (CONST_INT_P (XEXP (op0, 1)))
10602 *cost += extra_cost->alu.log_shift;
10603 else
10604 *cost += extra_cost->alu.log_shift_reg;
10605 }
10606 else
10607 *cost += extra_cost->alu.logical;
10608 }
10609
10610 /* In both cases we want to cost both operands. */
10611 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10612 0, speed);
10613 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10614 1, speed);
10615
10616 return true;
10617 }
10618 }
10619 return false;
10620
10621 case NOT:
10622 x = XEXP (x, 0);
10623 op0 = aarch64_strip_shift (x);
10624
10625 if (VECTOR_MODE_P (mode))
10626 {
10627 /* Vector NOT. */
10628 *cost += extra_cost->vect.alu;
10629 return false;
10630 }
10631
10632 /* MVN-shifted-reg. */
10633 if (op0 != x)
10634 {
10635 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10636
10637 if (speed)
10638 *cost += extra_cost->alu.log_shift;
10639
10640 return true;
10641 }
10642 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10643 Handle the second form here taking care that 'a' in the above can
10644 be a shift. */
10645 else if (GET_CODE (op0) == XOR)
10646 {
10647 rtx newop0 = XEXP (op0, 0);
10648 rtx newop1 = XEXP (op0, 1);
10649 rtx op0_stripped = aarch64_strip_shift (newop0);
10650
10651 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10652 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10653
10654 if (speed)
10655 {
10656 if (op0_stripped != newop0)
10657 *cost += extra_cost->alu.log_shift;
10658 else
10659 *cost += extra_cost->alu.logical;
10660 }
10661
10662 return true;
10663 }
10664 /* MVN. */
10665 if (speed)
10666 *cost += extra_cost->alu.logical;
10667
10668 return false;
10669
10670 case ZERO_EXTEND:
10671
10672 op0 = XEXP (x, 0);
10673 /* If a value is written in SI mode, then zero extended to DI
10674 mode, the operation will in general be free as a write to
10675 a 'w' register implicitly zeroes the upper bits of an 'x'
10676 register. However, if this is
10677
10678 (set (reg) (zero_extend (reg)))
10679
10680 we must cost the explicit register move. */
10681 if (mode == DImode
10682 && GET_MODE (op0) == SImode
10683 && outer == SET)
10684 {
10685 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10686
10687 /* If OP_COST is non-zero, then the cost of the zero extend
10688 is effectively the cost of the inner operation. Otherwise
10689 we have a MOV instruction and we take the cost from the MOV
10690 itself. This is true independently of whether we are
10691 optimizing for space or time. */
10692 if (op_cost)
10693 *cost = op_cost;
10694
10695 return true;
10696 }
10697 else if (MEM_P (op0))
10698 {
10699 /* All loads can zero extend to any size for free. */
10700 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10701 return true;
10702 }
10703
10704 op0 = aarch64_extend_bitfield_pattern_p (x);
10705 if (op0)
10706 {
10707 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10708 if (speed)
10709 *cost += extra_cost->alu.bfx;
10710 return true;
10711 }
10712
10713 if (speed)
10714 {
10715 if (VECTOR_MODE_P (mode))
10716 {
10717 /* UMOV. */
10718 *cost += extra_cost->vect.alu;
10719 }
10720 else
10721 {
10722 /* We generate an AND instead of UXTB/UXTH. */
10723 *cost += extra_cost->alu.logical;
10724 }
10725 }
10726 return false;
10727
10728 case SIGN_EXTEND:
10729 if (MEM_P (XEXP (x, 0)))
10730 {
10731 /* LDRSH. */
10732 if (speed)
10733 {
10734 rtx address = XEXP (XEXP (x, 0), 0);
10735 *cost += extra_cost->ldst.load_sign_extend;
10736
10737 *cost +=
10738 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10739 0, speed));
10740 }
10741 return true;
10742 }
10743
10744 op0 = aarch64_extend_bitfield_pattern_p (x);
10745 if (op0)
10746 {
10747 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10748 if (speed)
10749 *cost += extra_cost->alu.bfx;
10750 return true;
10751 }
10752
10753 if (speed)
10754 {
10755 if (VECTOR_MODE_P (mode))
10756 *cost += extra_cost->vect.alu;
10757 else
10758 *cost += extra_cost->alu.extend;
10759 }
10760 return false;
10761
10762 case ASHIFT:
10763 op0 = XEXP (x, 0);
10764 op1 = XEXP (x, 1);
10765
10766 if (CONST_INT_P (op1))
10767 {
10768 if (speed)
10769 {
10770 if (VECTOR_MODE_P (mode))
10771 {
10772 /* Vector shift (immediate). */
10773 *cost += extra_cost->vect.alu;
10774 }
10775 else
10776 {
10777 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10778 aliases. */
10779 *cost += extra_cost->alu.shift;
10780 }
10781 }
10782
10783 /* We can incorporate zero/sign extend for free. */
10784 if (GET_CODE (op0) == ZERO_EXTEND
10785 || GET_CODE (op0) == SIGN_EXTEND)
10786 op0 = XEXP (op0, 0);
10787
10788 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10789 return true;
10790 }
10791 else
10792 {
10793 if (VECTOR_MODE_P (mode))
10794 {
10795 if (speed)
10796 /* Vector shift (register). */
10797 *cost += extra_cost->vect.alu;
10798 }
10799 else
10800 {
10801 if (speed)
10802 /* LSLV. */
10803 *cost += extra_cost->alu.shift_reg;
10804
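/* A sketch of why this pattern is costed like a plain shift: the
   variable shift instructions only consume the low log2 (bitsize)
   bits of the shift register, so an AND of the shift amount with
   GET_MODE_BITSIZE (mode) - 1 is redundant and adds no cost.  */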
10805 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10806 && CONST_INT_P (XEXP (op1, 1))
10807 && known_eq (INTVAL (XEXP (op1, 1)),
10808 GET_MODE_BITSIZE (mode) - 1))
10809 {
10810 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10811 /* We already demanded XEXP (op1, 0) to be REG_P, so
10812 don't recurse into it. */
10813 return true;
10814 }
10815 }
10816 return false; /* All arguments need to be in registers. */
10817 }
10818
10819 case ROTATE:
10820 case ROTATERT:
10821 case LSHIFTRT:
10822 case ASHIFTRT:
10823 op0 = XEXP (x, 0);
10824 op1 = XEXP (x, 1);
10825
10826 if (CONST_INT_P (op1))
10827 {
10828 /* ASR (immediate) and friends. */
10829 if (speed)
10830 {
10831 if (VECTOR_MODE_P (mode))
10832 *cost += extra_cost->vect.alu;
10833 else
10834 *cost += extra_cost->alu.shift;
10835 }
10836
10837 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10838 return true;
10839 }
10840 else
10841 {
10842 if (VECTOR_MODE_P (mode))
10843 {
10844 if (speed)
10845 /* Vector shift (register). */
10846 *cost += extra_cost->vect.alu;
10847 }
10848 else
10849 {
10850 if (speed)
10851 /* ASR (register) and friends. */
10852 *cost += extra_cost->alu.shift_reg;
10853
10854 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10855 && CONST_INT_P (XEXP (op1, 1))
10856 && known_eq (INTVAL (XEXP (op1, 1)),
10857 GET_MODE_BITSIZE (mode) - 1))
10858 {
10859 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10860 /* We already demanded XEXP (op1, 0) to be REG_P, so
10861 don't recurse into it. */
10862 return true;
10863 }
10864 }
10865 return false; /* All arguments need to be in registers. */
10866 }
10867
10868 case SYMBOL_REF:
10869
10870 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10871 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10872 {
10873 /* LDR. */
10874 if (speed)
10875 *cost += extra_cost->ldst.load;
10876 }
10877 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10878 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10879 {
10880 /* ADRP, followed by ADD. */
10881 *cost += COSTS_N_INSNS (1);
10882 if (speed)
10883 *cost += 2 * extra_cost->alu.arith;
10884 }
10885 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10886 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10887 {
10888 /* ADR. */
10889 if (speed)
10890 *cost += extra_cost->alu.arith;
10891 }
10892
10893 if (flag_pic)
10894 {
10895 /* One extra load instruction, after accessing the GOT. */
10896 *cost += COSTS_N_INSNS (1);
10897 if (speed)
10898 *cost += extra_cost->ldst.load;
10899 }
10900 return true;
10901
10902 case HIGH:
10903 case LO_SUM:
10904 /* ADRP/ADD (immediate). */
10905 if (speed)
10906 *cost += extra_cost->alu.arith;
10907 return true;
10908
10909 case ZERO_EXTRACT:
10910 case SIGN_EXTRACT:
10911 /* UBFX/SBFX. */
10912 if (speed)
10913 {
10914 if (VECTOR_MODE_P (mode))
10915 *cost += extra_cost->vect.alu;
10916 else
10917 *cost += extra_cost->alu.bfx;
10918 }
10919
10920 /* We can trust that the immediates used will be correct (there
10921 are no by-register forms), so we need only cost op0. */
10922 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10923 return true;
10924
10925 case MULT:
10926 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10927 /* aarch64_rtx_mult_cost always handles recursion to its
10928 operands. */
10929 return true;
10930
10931 case MOD:
10932 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10933 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
10934 that of an unconditional negate. This case should only ever be reached through
10935 the set_smod_pow2_cheap check in expmed.c. */
10936 if (CONST_INT_P (XEXP (x, 1))
10937 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10938 && (mode == SImode || mode == DImode))
10939 {
10940 /* We expand to 4 instructions. Reset the baseline. */
10941 *cost = COSTS_N_INSNS (4);
10942
10943 if (speed)
10944 *cost += 2 * extra_cost->alu.logical
10945 + 2 * extra_cost->alu.arith;
10946
10947 return true;
10948 }
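/* As an illustrative sketch (assuming SImode and a modulus of 4), the
   expansion described above is roughly:

       negs  w1, w0
       and   w0, w0, 3
       and   w1, w1, 3
       csneg w0, w0, w1, mi

   i.e. four instructions, matching the COSTS_N_INSNS (4) baseline and
   the two logical plus two arithmetic operations added for speed.  */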
10949
10950 /* Fall-through. */
10951 case UMOD:
10952 if (speed)
10953 {
10954 /* Slightly prefer UMOD over SMOD. */
10955 if (VECTOR_MODE_P (mode))
10956 *cost += extra_cost->vect.alu;
10957 else if (GET_MODE_CLASS (mode) == MODE_INT)
10958 *cost += (extra_cost->mult[mode == DImode].add
10959 + extra_cost->mult[mode == DImode].idiv
10960 + (code == MOD ? 1 : 0));
10961 }
10962 return false; /* All arguments need to be in registers. */
10963
10964 case DIV:
10965 case UDIV:
10966 case SQRT:
10967 if (speed)
10968 {
10969 if (VECTOR_MODE_P (mode))
10970 *cost += extra_cost->vect.alu;
10971 else if (GET_MODE_CLASS (mode) == MODE_INT)
10972 /* There is no integer SQRT, so only DIV and UDIV can get
10973 here. */
10974 *cost += (extra_cost->mult[mode == DImode].idiv
10975 /* Slightly prefer UDIV over SDIV. */
10976 + (code == DIV ? 1 : 0));
10977 else
10978 *cost += extra_cost->fp[mode == DFmode].div;
10979 }
10980 return false; /* All arguments need to be in registers. */
10981
10982 case IF_THEN_ELSE:
10983 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10984 XEXP (x, 2), cost, speed);
10985
10986 case EQ:
10987 case NE:
10988 case GT:
10989 case GTU:
10990 case LT:
10991 case LTU:
10992 case GE:
10993 case GEU:
10994 case LE:
10995 case LEU:
10996
10997 return false; /* All arguments must be in registers. */
10998
10999 case FMA:
11000 op0 = XEXP (x, 0);
11001 op1 = XEXP (x, 1);
11002 op2 = XEXP (x, 2);
11003
11004 if (speed)
11005 {
11006 if (VECTOR_MODE_P (mode))
11007 *cost += extra_cost->vect.alu;
11008 else
11009 *cost += extra_cost->fp[mode == DFmode].fma;
11010 }
11011
11012 /* FMSUB, FNMADD, and FNMSUB are free. */
11013 if (GET_CODE (op0) == NEG)
11014 op0 = XEXP (op0, 0);
11015
11016 if (GET_CODE (op2) == NEG)
11017 op2 = XEXP (op2, 0);
11018
11019 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11020 and the by-element operand as operand 0. */
11021 if (GET_CODE (op1) == NEG)
11022 op1 = XEXP (op1, 0);
11023
11024 /* Catch vector-by-element operations. The by-element operand can
11025 either be (vec_duplicate (vec_select (x))) or just
11026 (vec_select (x)), depending on whether we are multiplying by
11027 a vector or a scalar.
11028
11029 Canonicalization is not very good in these cases: FMA4 will put the
11030 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11031 if (GET_CODE (op0) == VEC_DUPLICATE)
11032 op0 = XEXP (op0, 0);
11033 else if (GET_CODE (op1) == VEC_DUPLICATE)
11034 op1 = XEXP (op1, 0);
11035
11036 if (GET_CODE (op0) == VEC_SELECT)
11037 op0 = XEXP (op0, 0);
11038 else if (GET_CODE (op1) == VEC_SELECT)
11039 op1 = XEXP (op1, 0);
11040
11041 /* If the remaining parameters are not registers,
11042 get the cost to put them into registers. */
11043 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11044 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11045 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11046 return true;
11047
11048 case FLOAT:
11049 case UNSIGNED_FLOAT:
11050 if (speed)
11051 *cost += extra_cost->fp[mode == DFmode].fromint;
11052 return false;
11053
11054 case FLOAT_EXTEND:
11055 if (speed)
11056 {
11057 if (VECTOR_MODE_P (mode))
11058 {
11059 /* Vector widening conversion. */
11060 *cost += extra_cost->vect.alu;
11061 }
11062 else
11063 *cost += extra_cost->fp[mode == DFmode].widen;
11064 }
11065 return false;
11066
11067 case FLOAT_TRUNCATE:
11068 if (speed)
11069 {
11070 if (VECTOR_MODE_P (mode))
11071 {
11072 /* Vector conversion. */
11073 *cost += extra_cost->vect.alu;
11074 }
11075 else
11076 *cost += extra_cost->fp[mode == DFmode].narrow;
11077 }
11078 return false;
11079
11080 case FIX:
11081 case UNSIGNED_FIX:
11082 x = XEXP (x, 0);
11083 /* Strip the rounding part. They will all be implemented
11084 by the fcvt* family of instructions anyway. */
11085 if (GET_CODE (x) == UNSPEC)
11086 {
11087 unsigned int uns_code = XINT (x, 1);
11088
11089 if (uns_code == UNSPEC_FRINTA
11090 || uns_code == UNSPEC_FRINTM
11091 || uns_code == UNSPEC_FRINTN
11092 || uns_code == UNSPEC_FRINTP
11093 || uns_code == UNSPEC_FRINTZ)
11094 x = XVECEXP (x, 0, 0);
11095 }
11096
11097 if (speed)
11098 {
11099 if (VECTOR_MODE_P (mode))
11100 *cost += extra_cost->vect.alu;
11101 else
11102 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11103 }
11104
11105 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11106 fixed-point fcvt. */
11107 if (GET_CODE (x) == MULT
11108 && ((VECTOR_MODE_P (mode)
11109 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11110 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11111 {
11112 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11113 0, speed);
11114 return true;
11115 }
11116
11117 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11118 return true;
11119
11120 case ABS:
11121 if (VECTOR_MODE_P (mode))
11122 {
11123 /* ABS (vector). */
11124 if (speed)
11125 *cost += extra_cost->vect.alu;
11126 }
11127 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11128 {
11129 op0 = XEXP (x, 0);
11130
11131 /* FABD, which is analogous to FADD. */
11132 if (GET_CODE (op0) == MINUS)
11133 {
11134 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11135 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11136 if (speed)
11137 *cost += extra_cost->fp[mode == DFmode].addsub;
11138
11139 return true;
11140 }
11141 /* Simple FABS is analogous to FNEG. */
11142 if (speed)
11143 *cost += extra_cost->fp[mode == DFmode].neg;
11144 }
11145 else
11146 {
11147 /* Integer ABS will either be split to
11148 two arithmetic instructions, or will be an ABS
11149 (scalar), which we don't model. */
11150 *cost = COSTS_N_INSNS (2);
11151 if (speed)
11152 *cost += 2 * extra_cost->alu.arith;
11153 }
11154 return false;
11155
11156 case SMAX:
11157 case SMIN:
11158 if (speed)
11159 {
11160 if (VECTOR_MODE_P (mode))
11161 *cost += extra_cost->vect.alu;
11162 else
11163 {
11164 /* FMAXNM/FMINNM/FMAX/FMIN.
11165 TODO: This may not be accurate for all implementations, but
11166 we do not model this in the cost tables. */
11167 *cost += extra_cost->fp[mode == DFmode].addsub;
11168 }
11169 }
11170 return false;
11171
11172 case UNSPEC:
11173 /* The floating point round to integer frint* instructions. */
11174 if (aarch64_frint_unspec_p (XINT (x, 1)))
11175 {
11176 if (speed)
11177 *cost += extra_cost->fp[mode == DFmode].roundint;
11178
11179 return false;
11180 }
11181
11182 if (XINT (x, 1) == UNSPEC_RBIT)
11183 {
11184 if (speed)
11185 *cost += extra_cost->alu.rev;
11186
11187 return false;
11188 }
11189 break;
11190
11191 case TRUNCATE:
11192
11193 /* Decompose <su>muldi3_highpart. */
11194 if (/* (truncate:DI */
11195 mode == DImode
11196 /* (lshiftrt:TI */
11197 && GET_MODE (XEXP (x, 0)) == TImode
11198 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11199 /* (mult:TI */
11200 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11201 /* (ANY_EXTEND:TI (reg:DI))
11202 (ANY_EXTEND:TI (reg:DI))) */
11203 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11204 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11205 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11206 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11207 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11208 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11209 /* (const_int 64) */
11210 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11211 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11212 {
11213 /* UMULH/SMULH. */
11214 if (speed)
11215 *cost += extra_cost->mult[mode == DImode].extend;
11216 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11217 mode, MULT, 0, speed);
11218 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11219 mode, MULT, 1, speed);
11220 return true;
11221 }
11222
11223 /* Fall through. */
11224 default:
11225 break;
11226 }
11227
11228 if (dump_file
11229 && flag_aarch64_verbose_cost)
11230 fprintf (dump_file,
11231 "\nFailed to cost RTX. Assuming default cost.\n");
11232
11233 return true;
11234 }
11235
11236 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11237 calculated for X. This cost is stored in *COST. Returns true
11238 if the total cost of X was calculated. */
11239 static bool
11240 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11241 int param, int *cost, bool speed)
11242 {
11243 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11244
11245 if (dump_file
11246 && flag_aarch64_verbose_cost)
11247 {
11248 print_rtl_single (dump_file, x);
11249 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11250 speed ? "Hot" : "Cold",
11251 *cost, result ? "final" : "partial");
11252 }
11253
11254 return result;
11255 }
11256
11257 static int
11258 aarch64_register_move_cost (machine_mode mode,
11259 reg_class_t from_i, reg_class_t to_i)
11260 {
11261 enum reg_class from = (enum reg_class) from_i;
11262 enum reg_class to = (enum reg_class) to_i;
11263 const struct cpu_regmove_cost *regmove_cost
11264 = aarch64_tune_params.regmove_cost;
11265
11266 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11267 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11268 to = GENERAL_REGS;
11269
11270 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11271 from = GENERAL_REGS;
11272
11273 /* Moving between GPR and stack cost is the same as GP2GP. */
11274 if ((from == GENERAL_REGS && to == STACK_REG)
11275 || (to == GENERAL_REGS && from == STACK_REG))
11276 return regmove_cost->GP2GP;
11277
11278 /* To/From the stack register, we move via the gprs. */
11279 if (to == STACK_REG || from == STACK_REG)
11280 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11281 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11282
11283 if (known_eq (GET_MODE_SIZE (mode), 16))
11284 {
11285 /* 128-bit operations on general registers require 2 instructions. */
11286 if (from == GENERAL_REGS && to == GENERAL_REGS)
11287 return regmove_cost->GP2GP * 2;
11288 else if (from == GENERAL_REGS)
11289 return regmove_cost->GP2FP * 2;
11290 else if (to == GENERAL_REGS)
11291 return regmove_cost->FP2GP * 2;
11292
11293 /* When AdvSIMD instructions are disabled it is not possible to move
11294 a 128-bit value directly between Q registers. This is handled in
11295 secondary reload. A general register is used as a scratch to move
11296 the upper DI value and the lower DI value is moved directly,
11297 hence the cost is the sum of three moves. */
11298 if (! TARGET_SIMD)
11299 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11300
11301 return regmove_cost->FP2FP;
11302 }
11303
11304 if (from == GENERAL_REGS && to == GENERAL_REGS)
11305 return regmove_cost->GP2GP;
11306 else if (from == GENERAL_REGS)
11307 return regmove_cost->GP2FP;
11308 else if (to == GENERAL_REGS)
11309 return regmove_cost->FP2GP;
11310
11311 return regmove_cost->FP2FP;
11312 }
11313
11314 static int
11315 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11316 reg_class_t rclass ATTRIBUTE_UNUSED,
11317 bool in ATTRIBUTE_UNUSED)
11318 {
11319 return aarch64_tune_params.memmov_cost;
11320 }
11321
11322 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11323 to optimize 1.0/sqrt. */
11324
11325 static bool
11326 use_rsqrt_p (machine_mode mode)
11327 {
11328 return (!flag_trapping_math
11329 && flag_unsafe_math_optimizations
11330 && ((aarch64_tune_params.approx_modes->recip_sqrt
11331 & AARCH64_APPROX_MODE (mode))
11332 || flag_mrecip_low_precision_sqrt));
11333 }
11334
11335 /* Function to decide when to use the approximate reciprocal square root
11336 builtin. */
11337
11338 static tree
11339 aarch64_builtin_reciprocal (tree fndecl)
11340 {
11341 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11342
11343 if (!use_rsqrt_p (mode))
11344 return NULL_TREE;
11345 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11346 }
11347
11348 /* Emit instruction sequence to compute either the approximate square root
11349 or its approximate reciprocal, depending on the flag RECP, and return
11350 whether the sequence was emitted or not. */
11351
11352 bool
11353 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11354 {
11355 machine_mode mode = GET_MODE (dst);
11356
11357 if (GET_MODE_INNER (mode) == HFmode)
11358 {
11359 gcc_assert (!recp);
11360 return false;
11361 }
11362
11363 if (!recp)
11364 {
11365 if (!(flag_mlow_precision_sqrt
11366 || (aarch64_tune_params.approx_modes->sqrt
11367 & AARCH64_APPROX_MODE (mode))))
11368 return false;
11369
11370 if (flag_finite_math_only
11371 || flag_trapping_math
11372 || !flag_unsafe_math_optimizations
11373 || optimize_function_for_size_p (cfun))
11374 return false;
11375 }
11376 else
11377 /* Caller assumes we cannot fail. */
11378 gcc_assert (use_rsqrt_p (mode));
11379
11380 machine_mode mmsk = mode_for_int_vector (mode).require ();
11381 rtx xmsk = gen_reg_rtx (mmsk);
11382 if (!recp)
11383 /* When calculating the approximate square root, compare the
11384 argument with 0.0 and create a mask. */
11385 emit_insn (gen_rtx_SET (xmsk,
11386 gen_rtx_NEG (mmsk,
11387 gen_rtx_EQ (mmsk, src,
11388 CONST0_RTX (mode)))));
11389
11390 /* Estimate the approximate reciprocal square root. */
11391 rtx xdst = gen_reg_rtx (mode);
11392 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11393
11394 /* Iterate over the series twice for SF and thrice for DF. */
11395 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
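/* A rough sketch of why so few steps suffice: the FRSQRTE estimate is
   accurate to roughly 8 bits, and each Newton-Raphson step below,
   x' = x * (3 - d * x * x) / 2 (FRSQRTS supplies the (3 - a * b) / 2
   part), approximately doubles the number of correct bits, so two
   steps cover SFmode and three cover DFmode.  */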
11396
11397 /* Optionally iterate over the series once less for faster performance
11398 while sacrificing some accuracy. */
11399 if ((recp && flag_mrecip_low_precision_sqrt)
11400 || (!recp && flag_mlow_precision_sqrt))
11401 iterations--;
11402
11403 /* Iterate over the series to calculate the approximate reciprocal square
11404 root. */
11405 rtx x1 = gen_reg_rtx (mode);
11406 while (iterations--)
11407 {
11408 rtx x2 = gen_reg_rtx (mode);
11409 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11410
11411 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11412
11413 if (iterations > 0)
11414 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11415 }
11416
11417 if (!recp)
11418 {
11419 /* Qualify the approximate reciprocal square root when the argument is
11420 0.0 by squashing the intermediary result to 0.0. */
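/* (The reason this matters: the reciprocal square root estimate of zero
   is infinite, and multiplying an infinity by a zero SRC below would
   give a NaN instead of the expected 0.0, so those lanes are zeroed via
   the mask first.)  */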
11421 rtx xtmp = gen_reg_rtx (mmsk);
11422 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11423 gen_rtx_SUBREG (mmsk, xdst, 0)));
11424 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11425
11426 /* Calculate the approximate square root. */
11427 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11428 }
11429
11430 /* Finalize the approximation. */
11431 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11432
11433 return true;
11434 }
11435
11436 /* Emit the instruction sequence to compute the approximation for the division
11437 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11438
11439 bool
11440 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11441 {
11442 machine_mode mode = GET_MODE (quo);
11443
11444 if (GET_MODE_INNER (mode) == HFmode)
11445 return false;
11446
11447 bool use_approx_division_p = (flag_mlow_precision_div
11448 || (aarch64_tune_params.approx_modes->division
11449 & AARCH64_APPROX_MODE (mode)));
11450
11451 if (!flag_finite_math_only
11452 || flag_trapping_math
11453 || !flag_unsafe_math_optimizations
11454 || optimize_function_for_size_p (cfun)
11455 || !use_approx_division_p)
11456 return false;
11457
11458 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11459 return false;
11460
11461 /* Estimate the approximate reciprocal. */
11462 rtx xrcp = gen_reg_rtx (mode);
11463 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11464
11465 /* Iterate over the series twice for SF and thrice for DF. */
11466 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
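/* As a rough sketch, the FRECPE estimate of 1/DEN is accurate to about
   8 bits, and each step below, x' = x * (2 - d * x) (FRECPS supplies
   the 2 - a * b part), approximately doubles the precision, hence two
   steps for SFmode and three for DFmode.  */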
11467
11468 /* Optionally iterate over the series once less for faster performance,
11469 while sacrificing some accuracy. */
11470 if (flag_mlow_precision_div)
11471 iterations--;
11472
11473 /* Iterate over the series to calculate the approximate reciprocal. */
11474 rtx xtmp = gen_reg_rtx (mode);
11475 while (iterations--)
11476 {
11477 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11478
11479 if (iterations > 0)
11480 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11481 }
11482
11483 if (num != CONST1_RTX (mode))
11484 {
11485 /* As the approximate reciprocal of DEN is already calculated, only
11486 calculate the approximate division when NUM is not 1.0. */
11487 rtx xnum = force_reg (mode, num);
11488 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11489 }
11490
11491 /* Finalize the approximation. */
11492 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11493 return true;
11494 }
11495
11496 /* Return the number of instructions that can be issued per cycle. */
11497 static int
11498 aarch64_sched_issue_rate (void)
11499 {
11500 return aarch64_tune_params.issue_rate;
11501 }
11502
11503 static int
11504 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11505 {
11506 int issue_rate = aarch64_sched_issue_rate ();
11507
11508 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11509 }
11510
11511
11512 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11513 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11514 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11515
11516 static int
11517 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11518 int ready_index)
11519 {
11520 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11521 }
11522
11523
11524 /* Vectorizer cost model target hooks. */
11525
11526 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11527 static int
11528 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11529 tree vectype,
11530 int misalign ATTRIBUTE_UNUSED)
11531 {
11532 unsigned elements;
11533 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11534 bool fp = false;
11535
11536 if (vectype != NULL)
11537 fp = FLOAT_TYPE_P (vectype);
11538
11539 switch (type_of_cost)
11540 {
11541 case scalar_stmt:
11542 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11543
11544 case scalar_load:
11545 return costs->scalar_load_cost;
11546
11547 case scalar_store:
11548 return costs->scalar_store_cost;
11549
11550 case vector_stmt:
11551 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11552
11553 case vector_load:
11554 return costs->vec_align_load_cost;
11555
11556 case vector_store:
11557 return costs->vec_store_cost;
11558
11559 case vec_to_scalar:
11560 return costs->vec_to_scalar_cost;
11561
11562 case scalar_to_vec:
11563 return costs->scalar_to_vec_cost;
11564
11565 case unaligned_load:
11566 case vector_gather_load:
11567 return costs->vec_unalign_load_cost;
11568
11569 case unaligned_store:
11570 case vector_scatter_store:
11571 return costs->vec_unalign_store_cost;
11572
11573 case cond_branch_taken:
11574 return costs->cond_taken_branch_cost;
11575
11576 case cond_branch_not_taken:
11577 return costs->cond_not_taken_branch_cost;
11578
11579 case vec_perm:
11580 return costs->vec_permute_cost;
11581
11582 case vec_promote_demote:
11583 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11584
11585 case vec_construct:
11586 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11587 return elements / 2 + 1;
11588
11589 default:
11590 gcc_unreachable ();
11591 }
11592 }
11593
11594 /* Implement targetm.vectorize.add_stmt_cost. */
11595 static unsigned
11596 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11597 struct _stmt_vec_info *stmt_info, int misalign,
11598 enum vect_cost_model_location where)
11599 {
11600 unsigned *cost = (unsigned *) data;
11601 unsigned retval = 0;
11602
11603 if (flag_vect_cost_model)
11604 {
11605 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11606 int stmt_cost =
11607 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11608
11609 /* Statements in an inner loop relative to the loop being
11610 vectorized are weighted more heavily. The value here is
11611 arbitrary and could potentially be improved with analysis. */
11612 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11613 count *= 50; /* FIXME */
11614
11615 retval = (unsigned) (count * stmt_cost);
11616 cost[where] += retval;
11617 }
11618
11619 return retval;
11620 }
11621
11622 static void initialize_aarch64_code_model (struct gcc_options *);
11623
11624 /* Parse the TO_PARSE string and put the architecture struct that it
11625 selects into RES and the architectural features into ISA_FLAGS.
11626 Return an aarch64_parse_opt_result describing the parse result.
11627 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11628 When the TO_PARSE string contains an invalid extension,
11629 a copy of the string is created and stored to INVALID_EXTENSION. */
11630
11631 static enum aarch64_parse_opt_result
11632 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11633 uint64_t *isa_flags, std::string *invalid_extension)
11634 {
11635 const char *ext;
11636 const struct processor *arch;
11637 size_t len;
11638
11639 ext = strchr (to_parse, '+');
11640
11641 if (ext != NULL)
11642 len = ext - to_parse;
11643 else
11644 len = strlen (to_parse);
11645
11646 if (len == 0)
11647 return AARCH64_PARSE_MISSING_ARG;
11648
11649
11650 /* Loop through the list of supported ARCHes to find a match. */
11651 for (arch = all_architectures; arch->name != NULL; arch++)
11652 {
11653 if (strlen (arch->name) == len
11654 && strncmp (arch->name, to_parse, len) == 0)
11655 {
11656 uint64_t isa_temp = arch->flags;
11657
11658 if (ext != NULL)
11659 {
11660 /* TO_PARSE string contains at least one extension. */
11661 enum aarch64_parse_opt_result ext_res
11662 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11663
11664 if (ext_res != AARCH64_PARSE_OK)
11665 return ext_res;
11666 }
11667 /* Extension parsing was successful. Confirm the result
11668 arch and ISA flags. */
11669 *res = arch;
11670 *isa_flags = isa_temp;
11671 return AARCH64_PARSE_OK;
11672 }
11673 }
11674
11675 /* ARCH name not found in list. */
11676 return AARCH64_PARSE_INVALID_ARG;
11677 }
11678
11679 /* Parse the TO_PARSE string and put the result tuning in RES and the
11680 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11681 describing the parse result. If there is an error parsing, RES and
11682 ISA_FLAGS are left unchanged.
11683 When the TO_PARSE string contains an invalid extension,
11684 a copy of the string is created and stored to INVALID_EXTENSION. */
11685
11686 static enum aarch64_parse_opt_result
11687 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11688 uint64_t *isa_flags, std::string *invalid_extension)
11689 {
11690 const char *ext;
11691 const struct processor *cpu;
11692 size_t len;
11693
11694 ext = strchr (to_parse, '+');
11695
11696 if (ext != NULL)
11697 len = ext - to_parse;
11698 else
11699 len = strlen (to_parse);
11700
11701 if (len == 0)
11702 return AARCH64_PARSE_MISSING_ARG;
11703
11704
11705 /* Loop through the list of supported CPUs to find a match. */
11706 for (cpu = all_cores; cpu->name != NULL; cpu++)
11707 {
11708 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11709 {
11710 uint64_t isa_temp = cpu->flags;
11711
11712
11713 if (ext != NULL)
11714 {
11715 /* TO_PARSE string contains at least one extension. */
11716 enum aarch64_parse_opt_result ext_res
11717 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11718
11719 if (ext_res != AARCH64_PARSE_OK)
11720 return ext_res;
11721 }
11722 /* Extension parsing was successful. Confirm the result
11723 cpu and ISA flags. */
11724 *res = cpu;
11725 *isa_flags = isa_temp;
11726 return AARCH64_PARSE_OK;
11727 }
11728 }
11729
11730 /* CPU name not found in list. */
11731 return AARCH64_PARSE_INVALID_ARG;
11732 }
11733
11734 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11735 Return an aarch64_parse_opt_result describing the parse result.
11736 If the parsing fails the RES does not change. */
11737
11738 static enum aarch64_parse_opt_result
11739 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11740 {
11741 const struct processor *cpu;
11742
11743 /* Loop through the list of supported CPUs to find a match. */
11744 for (cpu = all_cores; cpu->name != NULL; cpu++)
11745 {
11746 if (strcmp (cpu->name, to_parse) == 0)
11747 {
11748 *res = cpu;
11749 return AARCH64_PARSE_OK;
11750 }
11751 }
11752
11753 /* CPU name not found in list. */
11754 return AARCH64_PARSE_INVALID_ARG;
11755 }
11756
11757 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11758 described in FLAG. If it is, return the index bit for that fusion type.
11759 If not, error (printing OPTION_NAME) and return zero. */
11760
11761 static unsigned int
11762 aarch64_parse_one_option_token (const char *token,
11763 size_t length,
11764 const struct aarch64_flag_desc *flag,
11765 const char *option_name)
11766 {
11767 for (; flag->name != NULL; flag++)
11768 {
11769 if (length == strlen (flag->name)
11770 && !strncmp (flag->name, token, length))
11771 return flag->flag;
11772 }
11773
11774 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11775 return 0;
11776 }
11777
11778 /* Parse OPTION which is a dot-separated list of flags to enable.
11779 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11780 default state we inherit from the CPU tuning structures. OPTION_NAME
11781 gives the top-level option we are parsing in the -moverride string,
11782 for use in error messages. */
11783
11784 static unsigned int
11785 aarch64_parse_boolean_options (const char *option,
11786 const struct aarch64_flag_desc *flags,
11787 unsigned int initial_state,
11788 const char *option_name)
11789 {
11790 const char separator = '.';
11791 const char* specs = option;
11792 const char* ntoken = option;
11793 unsigned int found_flags = initial_state;
11794
11795 while ((ntoken = strchr (specs, separator)))
11796 {
11797 size_t token_length = ntoken - specs;
11798 unsigned token_ops = aarch64_parse_one_option_token (specs,
11799 token_length,
11800 flags,
11801 option_name);
11802 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11803 in the token stream, reset the supported operations. So:
11804
11805 adrp+add.cmp+branch.none.adrp+add
11806
11807 would have the result of turning on only adrp+add fusion. */
11808 if (!token_ops)
11809 found_flags = 0;
11810
11811 found_flags |= token_ops;
11812 specs = ++ntoken;
11813 }
11814
11815 /* We ended with a trailing separator; report the ill-formed string. */
11816 if (!(*specs))
11817 {
11818 error ("%s string ill-formed\n", option_name);
11819 return 0;
11820 }
11821
11822 /* We still have one more token to parse. */
11823 size_t token_length = strlen (specs);
11824 unsigned token_ops = aarch64_parse_one_option_token (specs,
11825 token_length,
11826 flags,
11827 option_name);
11828 if (!token_ops)
11829 found_flags = 0;
11830
11831 found_flags |= token_ops;
11832 return found_flags;
11833 }
11834
11835 /* Support for overriding instruction fusion. */
11836
11837 static void
11838 aarch64_parse_fuse_string (const char *fuse_string,
11839 struct tune_params *tune)
11840 {
11841 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11842 aarch64_fusible_pairs,
11843 tune->fusible_ops,
11844 "fuse=");
11845 }
11846
11847 /* Support for overriding other tuning flags. */
11848
11849 static void
11850 aarch64_parse_tune_string (const char *tune_string,
11851 struct tune_params *tune)
11852 {
11853 tune->extra_tuning_flags
11854 = aarch64_parse_boolean_options (tune_string,
11855 aarch64_tuning_flags,
11856 tune->extra_tuning_flags,
11857 "tune=");
11858 }
11859
11860 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11861 Accept the valid SVE vector widths allowed by
11862 aarch64_sve_vector_bits_enum and use it to override sve_width
11863 in TUNE. */
11864
11865 static void
11866 aarch64_parse_sve_width_string (const char *tune_string,
11867 struct tune_params *tune)
11868 {
11869 int width = -1;
11870
11871 int n = sscanf (tune_string, "%d", &width);
11872 if (n == EOF)
11873 {
11874 error ("invalid format for sve_width");
11875 return;
11876 }
11877 switch (width)
11878 {
11879 case SVE_128:
11880 case SVE_256:
11881 case SVE_512:
11882 case SVE_1024:
11883 case SVE_2048:
11884 break;
11885 default:
11886 error ("invalid sve_width value: %d", width);
11887 }
11888 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11889 }
11890
11891 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11892 we understand. If it is, extract the option string and hand it off to
11893 the appropriate function. */
11894
11895 void
11896 aarch64_parse_one_override_token (const char* token,
11897 size_t length,
11898 struct tune_params *tune)
11899 {
11900 const struct aarch64_tuning_override_function *fn
11901 = aarch64_tuning_override_functions;
11902
11903 const char *option_part = strchr (token, '=');
11904 if (!option_part)
11905 {
11906 error ("tuning string missing in option (%s)", token);
11907 return;
11908 }
11909
11910 /* Get the length of the option name. */
11911 length = option_part - token;
11912 /* Skip the '=' to get to the option string. */
11913 option_part++;
11914
11915 for (; fn->name != NULL; fn++)
11916 {
11917 if (!strncmp (fn->name, token, length))
11918 {
11919 fn->parse_override (option_part, tune);
11920 return;
11921 }
11922 }
11923
11924 error ("unknown tuning option (%s)",token);
11925 return;
11926 }
11927
11928 /* Check the TLS size and clamp it to what the code model in OPTS allows. */
11929
11930 static void
11931 initialize_aarch64_tls_size (struct gcc_options *opts)
11932 {
11933 if (aarch64_tls_size == 0)
11934 aarch64_tls_size = 24;
11935
11936 switch (opts->x_aarch64_cmodel_var)
11937 {
11938 case AARCH64_CMODEL_TINY:
11939 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11940 needs two instructions to address, so we clamp the size to 24. */
11941 if (aarch64_tls_size > 24)
11942 aarch64_tls_size = 24;
11943 break;
11944 case AARCH64_CMODEL_SMALL:
11945 /* The maximum TLS size allowed under small is 4G. */
11946 if (aarch64_tls_size > 32)
11947 aarch64_tls_size = 32;
11948 break;
11949 case AARCH64_CMODEL_LARGE:
11950 /* The maximum TLS size allowed under large is 16E.
11951 FIXME: 16E needs a 64-bit offset, but we only support 48-bit offsets now. */
11952 if (aarch64_tls_size > 48)
11953 aarch64_tls_size = 48;
11954 break;
11955 default:
11956 gcc_unreachable ();
11957 }
11958
11959 return;
11960 }
11961
11962 /* Parse STRING looking for options in the format:
11963 string :: option:string
11964 option :: name=substring
11965 name :: {a-z}
11966 substring :: defined by option. */
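/* For instance (an illustrative value, not a recommendation), the string
   "fuse=adrp+add.cmp+branch:sve_width=256" is split on ':' into two
   options; "adrp+add.cmp+branch" is then handed to the fuse= handler,
   which splits the boolean flags on '.', and "256" goes to the
   sve_width= handler.  */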
11967
11968 static void
11969 aarch64_parse_override_string (const char* input_string,
11970 struct tune_params* tune)
11971 {
11972 const char separator = ':';
11973 size_t string_length = strlen (input_string) + 1;
11974 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11975 char *string = string_root;
11976 strncpy (string, input_string, string_length);
11977 string[string_length - 1] = '\0';
11978
11979 char* ntoken = string;
11980
11981 while ((ntoken = strchr (string, separator)))
11982 {
11983 size_t token_length = ntoken - string;
11984 /* Make this substring look like a string. */
11985 *ntoken = '\0';
11986 aarch64_parse_one_override_token (string, token_length, tune);
11987 string = ++ntoken;
11988 }
11989
11990 /* One last option to parse. */
11991 aarch64_parse_one_override_token (string, strlen (string), tune);
11992 free (string_root);
11993 }
11994
11995
11996 static void
11997 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11998 {
11999 if (accepted_branch_protection_string)
12000 {
12001 opts->x_aarch64_branch_protection_string
12002 = xstrdup (accepted_branch_protection_string);
12003 }
12004
12005 /* PR 70044: We have to be careful about being called multiple times for the
12006 same function. This means all changes should be repeatable. */
12007
12008 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12009 Disable the frame pointer flag so the mid-end will not use a frame
12010 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12011 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12012 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12013 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12014 if (opts->x_flag_omit_frame_pointer == 0)
12015 opts->x_flag_omit_frame_pointer = 2;
12016
12017 /* If not optimizing for size, set the default
12018 alignment to what the target wants. */
12019 if (!opts->x_optimize_size)
12020 {
12021 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12022 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12023 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12024 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12025 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12026 opts->x_str_align_functions = aarch64_tune_params.function_align;
12027 }
12028
12029 /* We default to no pc-relative literal loads. */
12030
12031 aarch64_pcrelative_literal_loads = false;
12032
12033 /* If -mpc-relative-literal-loads is set on the command line, this
12034 implies that the user asked for PC relative literal loads. */
12035 if (opts->x_pcrelative_literal_loads == 1)
12036 aarch64_pcrelative_literal_loads = true;
12037
12038 /* In the tiny memory model it makes no sense to disallow PC relative
12039 literal pool loads. */
12040 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12041 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12042 aarch64_pcrelative_literal_loads = true;
12043
12044 /* When enabling the lower precision Newton series for the square root, also
12045 enable it for the reciprocal square root, since the latter is an
12046 intermediary step for the former. */
12047 if (flag_mlow_precision_sqrt)
12048 flag_mrecip_low_precision_sqrt = true;
12049 }
12050
12051 /* 'Unpack' the internal tuning structs and update the options
12052 in OPTS. The caller must have set up selected_tune and selected_arch
12053 as all the other target-specific codegen decisions are
12054 derived from them. */
12055
12056 void
12057 aarch64_override_options_internal (struct gcc_options *opts)
12058 {
12059 aarch64_tune_flags = selected_tune->flags;
12060 aarch64_tune = selected_tune->sched_core;
12061 /* Make a copy of the tuning parameters attached to the core, which
12062 we may later overwrite. */
12063 aarch64_tune_params = *(selected_tune->tune);
12064 aarch64_architecture_version = selected_arch->architecture_version;
12065
12066 if (opts->x_aarch64_override_tune_string)
12067 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12068 &aarch64_tune_params);
12069
12070 /* This target defaults to strict volatile bitfields. */
12071 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12072 opts->x_flag_strict_volatile_bitfields = 1;
12073
12074 if (aarch64_stack_protector_guard == SSP_GLOBAL
12075 && opts->x_aarch64_stack_protector_guard_offset_str)
12076 {
12077 error ("incompatible options %<-mstack-protector-guard=global%> and "
12078 "%<-mstack-protector-guard-offset=%s%>",
12079 aarch64_stack_protector_guard_offset_str);
12080 }
12081
12082 if (aarch64_stack_protector_guard == SSP_SYSREG
12083 && !(opts->x_aarch64_stack_protector_guard_offset_str
12084 && opts->x_aarch64_stack_protector_guard_reg_str))
12085 {
12086 error ("both %<-mstack-protector-guard-offset%> and "
12087 "%<-mstack-protector-guard-reg%> must be used "
12088 "with %<-mstack-protector-guard=sysreg%>");
12089 }
12090
12091 if (opts->x_aarch64_stack_protector_guard_reg_str)
12092 {
12093 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12094 error ("specify a system register with a small string length.");
12095 }
12096
12097 if (opts->x_aarch64_stack_protector_guard_offset_str)
12098 {
12099 char *end;
12100 const char *str = aarch64_stack_protector_guard_offset_str;
12101 errno = 0;
12102 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12103 if (!*str || *end || errno)
12104 error ("%qs is not a valid offset in %qs", str,
12105 "-mstack-protector-guard-offset=");
12106 aarch64_stack_protector_guard_offset = offs;
12107 }
12108
12109 initialize_aarch64_code_model (opts);
12110 initialize_aarch64_tls_size (opts);
12111
12112 int queue_depth = 0;
12113 switch (aarch64_tune_params.autoprefetcher_model)
12114 {
12115 case tune_params::AUTOPREFETCHER_OFF:
12116 queue_depth = -1;
12117 break;
12118 case tune_params::AUTOPREFETCHER_WEAK:
12119 queue_depth = 0;
12120 break;
12121 case tune_params::AUTOPREFETCHER_STRONG:
12122 queue_depth = max_insn_queue_index + 1;
12123 break;
12124 default:
12125 gcc_unreachable ();
12126 }
12127
12128 /* We don't mind passing in global_options_set here as we don't use
12129 the *options_set structs anyway. */
12130 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12131 queue_depth,
12132 opts->x_param_values,
12133 global_options_set.x_param_values);
12134
12135 /* Set up parameters to be used in prefetching algorithm. Do not
12136 override the defaults unless we are tuning for a core we have
12137 researched values for. */
12138 if (aarch64_tune_params.prefetch->num_slots > 0)
12139 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12140 aarch64_tune_params.prefetch->num_slots,
12141 opts->x_param_values,
12142 global_options_set.x_param_values);
12143 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12144 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12145 aarch64_tune_params.prefetch->l1_cache_size,
12146 opts->x_param_values,
12147 global_options_set.x_param_values);
12148 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12149 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12150 aarch64_tune_params.prefetch->l1_cache_line_size,
12151 opts->x_param_values,
12152 global_options_set.x_param_values);
12153 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12154 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12155 aarch64_tune_params.prefetch->l2_cache_size,
12156 opts->x_param_values,
12157 global_options_set.x_param_values);
12158 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12159 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12160 0,
12161 opts->x_param_values,
12162 global_options_set.x_param_values);
12163 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12164 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12165 aarch64_tune_params.prefetch->minimum_stride,
12166 opts->x_param_values,
12167 global_options_set.x_param_values);
12168
12169 /* Use the alternative scheduling-pressure algorithm by default. */
12170 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12171 opts->x_param_values,
12172 global_options_set.x_param_values);
12173
12174 /* If the user hasn't changed it via configure then set the default to 64 KB
12175 for the backend. */
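/* The parameter value is the log2 of the guard size in bytes, so the
   16 below corresponds to the 64 KB default mentioned above.  */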
12176 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12177 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12178 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12179 opts->x_param_values,
12180 global_options_set.x_param_values);
12181
12182 /* Validate the guard size. */
12183 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12184
12185 /* Enforce that the probing interval is the same as the guard size so the
12186 mid-end does the right thing. */
12187 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12188 guard_size,
12189 opts->x_param_values,
12190 global_options_set.x_param_values);
12191
12192 /* The maybe_set calls won't update the value if the user has explicitly set
12193 one. Which means we need to validate that probing interval and guard size
12194 are equal. */
12195 int probe_interval
12196 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12197 if (guard_size != probe_interval)
12198 error ("stack clash guard size %<%d%> must be equal to probing interval "
12199 "%<%d%>", guard_size, probe_interval);
12200
12201 /* Enable software prefetching at the specified optimization level for
12202 CPUs that have prefetch. Lower the optimization level threshold by 1
12203 when profiling is enabled. */
12204 if (opts->x_flag_prefetch_loop_arrays < 0
12205 && !opts->x_optimize_size
12206 && aarch64_tune_params.prefetch->default_opt_level >= 0
12207 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12208 opts->x_flag_prefetch_loop_arrays = 1;
12209
12210 if (opts->x_aarch64_arch_string == NULL)
12211 opts->x_aarch64_arch_string = selected_arch->name;
12212 if (opts->x_aarch64_cpu_string == NULL)
12213 opts->x_aarch64_cpu_string = selected_cpu->name;
12214 if (opts->x_aarch64_tune_string == NULL)
12215 opts->x_aarch64_tune_string = selected_tune->name;
12216
12217 aarch64_override_options_after_change_1 (opts);
12218 }
12219
12220 /* Print a hint with a suggestion for a core or architecture name that
12221 most closely resembles what the user passed in STR. ARCH is true if
12222 the user is asking for an architecture name. ARCH is false if the user
12223 is asking for a core name. */
12224
12225 static void
12226 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12227 {
12228 auto_vec<const char *> candidates;
12229 const struct processor *entry = arch ? all_architectures : all_cores;
12230 for (; entry->name != NULL; entry++)
12231 candidates.safe_push (entry->name);
12232
12233 #ifdef HAVE_LOCAL_CPU_DETECT
12234 /* Add also "native" as possible value. */
12235 if (arch)
12236 candidates.safe_push ("native");
12237 #endif
12238
12239 char *s;
12240 const char *hint = candidates_list_and_hint (str, s, candidates);
12241 if (hint)
12242 inform (input_location, "valid arguments are: %s;"
12243 " did you mean %qs?", s, hint);
12244 else
12245 inform (input_location, "valid arguments are: %s", s);
12246
12247 XDELETEVEC (s);
12248 }
12249
12250 /* Print a hint with a suggestion for a core name that most closely resembles
12251 what the user passed in STR. */
12252
12253 inline static void
12254 aarch64_print_hint_for_core (const char *str)
12255 {
12256 aarch64_print_hint_for_core_or_arch (str, false);
12257 }
12258
12259 /* Print a hint with a suggestion for an architecture name that most closely
12260 resembles what the user passed in STR. */
12261
12262 inline static void
12263 aarch64_print_hint_for_arch (const char *str)
12264 {
12265 aarch64_print_hint_for_core_or_arch (str, true);
12266 }
12267
12268
12269 /* Print a hint with a suggestion for an extension name
12270 that most closely resembles what the user passed in STR. */
12271
12272 void
12273 aarch64_print_hint_for_extensions (const std::string &str)
12274 {
12275 auto_vec<const char *> candidates;
12276 aarch64_get_all_extension_candidates (&candidates);
12277 char *s;
12278 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12279 if (hint)
12280 inform (input_location, "valid arguments are: %s;"
12281 " did you mean %qs?", s, hint);
12282 else
12283 inform (input_location, "valid arguments are: %s;", s);
12284
12285 XDELETEVEC (s);
12286 }
12287
12288 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12289 specified in STR and throw errors if appropriate. Put the results if
12290 they are valid in RES and ISA_FLAGS. Return whether the option is
12291 valid. */
12292
12293 static bool
12294 aarch64_validate_mcpu (const char *str, const struct processor **res,
12295 uint64_t *isa_flags)
12296 {
12297 std::string invalid_extension;
12298 enum aarch64_parse_opt_result parse_res
12299 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12300
12301 if (parse_res == AARCH64_PARSE_OK)
12302 return true;
12303
12304 switch (parse_res)
12305 {
12306 case AARCH64_PARSE_MISSING_ARG:
12307 error ("missing cpu name in %<-mcpu=%s%>", str);
12308 break;
12309 case AARCH64_PARSE_INVALID_ARG:
12310 error ("unknown value %qs for %<-mcpu%>", str);
12311 aarch64_print_hint_for_core (str);
12312 break;
12313 case AARCH64_PARSE_INVALID_FEATURE:
12314 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12315 invalid_extension.c_str (), str);
12316 aarch64_print_hint_for_extensions (invalid_extension);
12317 break;
12318 default:
12319 gcc_unreachable ();
12320 }
12321
12322 return false;
12323 }
12324
12325 /* Parses CONST_STR for branch protection features specified in
12326 aarch64_branch_protect_types, and set any global variables required. Returns
12327 the parsing result and assigns LAST_STR to the last processed token from
12328 CONST_STR so that it can be used for error reporting. */
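/* As an example of the expected shape (assuming the type names defined in
   aarch64_branch_protect_types, e.g. "pac-ret" with a "leaf" subtype and
   "bti"), a string such as "pac-ret+leaf+bti" is split on '+'; each token
   is matched first against the top-level types and then against the
   current type's subtypes.  */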
12329
12330 static enum
12331 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12332 char** last_str)
12333 {
12334 char *str_root = xstrdup (const_str);
12335 char* token_save = NULL;
12336 char *str = strtok_r (str_root, "+", &token_save);
12337 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12338 if (!str)
12339 res = AARCH64_PARSE_MISSING_ARG;
12340 else
12341 {
12342 char *next_str = strtok_r (NULL, "+", &token_save);
12343 /* Reset the branch protection features to their defaults. */
12344 aarch64_handle_no_branch_protection (NULL, NULL);
12345
12346 while (str && res == AARCH64_PARSE_OK)
12347 {
12348 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12349 bool found = false;
12350 /* Search for this type. */
12351 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12352 {
12353 if (strcmp (str, type->name) == 0)
12354 {
12355 found = true;
12356 res = type->handler (str, next_str);
12357 str = next_str;
12358 next_str = strtok_r (NULL, "+", &token_save);
12359 }
12360 else
12361 type++;
12362 }
12363 if (found && res == AARCH64_PARSE_OK)
12364 {
12365 bool found_subtype = true;
12366 /* Loop through each token until we find one that isn't a
12367 subtype. */
12368 while (found_subtype)
12369 {
12370 found_subtype = false;
12371 const aarch64_branch_protect_type *subtype = type->subtypes;
12372 /* Search for the subtype. */
12373 while (str && subtype && subtype->name && !found_subtype
12374 && res == AARCH64_PARSE_OK)
12375 {
12376 if (strcmp (str, subtype->name) == 0)
12377 {
12378 found_subtype = true;
12379 res = subtype->handler (str, next_str);
12380 str = next_str;
12381 next_str = strtok_r (NULL, "+", &token_save);
12382 }
12383 else
12384 subtype++;
12385 }
12386 }
12387 }
12388 else if (!found)
12389 res = AARCH64_PARSE_INVALID_ARG;
12390 }
12391 }
12392 /* Copy the last processed token into the argument to pass it back.
12393 Used by option and attribute validation to print the offending token. */
12394 if (last_str)
12395 {
12396 if (str) strcpy (*last_str, str);
12397 else *last_str = NULL;
12398 }
12399 if (res == AARCH64_PARSE_OK)
12400 {
12401 /* If needed, alloc the accepted string then copy in const_str.
12402 Used by aarch64_override_options_after_change_1. */
12403 if (!accepted_branch_protection_string)
12404 accepted_branch_protection_string = (char *) xmalloc (
12405 BRANCH_PROTECT_STR_MAX
12406 + 1);
12407 strncpy (accepted_branch_protection_string, const_str,
12408 BRANCH_PROTECT_STR_MAX + 1);
12409 /* Forcibly null-terminate. */
12410 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12411 }
12412 return res;
12413 }
12414
12415 static bool
12416 aarch64_validate_mbranch_protection (const char *const_str)
12417 {
12418 char *str = (char *) xmalloc (strlen (const_str) + 1);
12419 enum aarch64_parse_opt_result res =
12420 aarch64_parse_branch_protection (const_str, &str);
12421 if (res == AARCH64_PARSE_INVALID_ARG)
12422 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12423 else if (res == AARCH64_PARSE_MISSING_ARG)
12424 error ("missing argument for %<-mbranch-protection=%>");
12425 free (str);
12426 return res == AARCH64_PARSE_OK;
12427 }
12428
12429 /* Validate a command-line -march option. Parse the arch and extensions
12430 (if any) specified in STR and throw errors if appropriate. Put the
12431 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12432 option is valid. */
12433
12434 static bool
12435 aarch64_validate_march (const char *str, const struct processor **res,
12436 uint64_t *isa_flags)
12437 {
12438 std::string invalid_extension;
12439 enum aarch64_parse_opt_result parse_res
12440 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12441
12442 if (parse_res == AARCH64_PARSE_OK)
12443 return true;
12444
12445 switch (parse_res)
12446 {
12447 case AARCH64_PARSE_MISSING_ARG:
12448 error ("missing arch name in %<-march=%s%>", str);
12449 break;
12450 case AARCH64_PARSE_INVALID_ARG:
12451 error ("unknown value %qs for %<-march%>", str);
12452 aarch64_print_hint_for_arch (str);
12453 break;
12454 case AARCH64_PARSE_INVALID_FEATURE:
12455 error ("invalid feature modifier %qs in %<-march=%s%>",
12456 invalid_extension.c_str (), str);
12457 aarch64_print_hint_for_extensions (invalid_extension);
12458 break;
12459 default:
12460 gcc_unreachable ();
12461 }
12462
12463 return false;
12464 }
12465
12466 /* Validate a command-line -mtune option. Parse the cpu
12467 specified in STR and throw errors if appropriate. Put the
12468 result, if it is valid, in RES. Return whether the option is
12469 valid. */
12470
12471 static bool
12472 aarch64_validate_mtune (const char *str, const struct processor **res)
12473 {
12474 enum aarch64_parse_opt_result parse_res
12475 = aarch64_parse_tune (str, res);
12476
12477 if (parse_res == AARCH64_PARSE_OK)
12478 return true;
12479
12480 switch (parse_res)
12481 {
12482 case AARCH64_PARSE_MISSING_ARG:
12483 error ("missing cpu name in %<-mtune=%s%>", str);
12484 break;
12485 case AARCH64_PARSE_INVALID_ARG:
12486 error ("unknown value %qs for %<-mtune%>", str);
12487 aarch64_print_hint_for_core (str);
12488 break;
12489 default:
12490 gcc_unreachable ();
12491 }
12492 return false;
12493 }
12494
12495 /* Return the CPU corresponding to the enum CPU.
12496 If it doesn't specify a cpu, return the default. */
12497
12498 static const struct processor *
12499 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12500 {
12501 if (cpu != aarch64_none)
12502 return &all_cores[cpu];
12503
12504 /* The & 0x3f is to extract the bottom 6 bits that encode the
12505 default cpu as selected by the --with-cpu GCC configure option
12506 in config.gcc.
12507 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12508 flags mechanism should be reworked to make it more sane. */
12509 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12510 }
12511
12512 /* Return the architecture corresponding to the enum ARCH.
12513 If it doesn't specify a valid architecture, return the default. */
12514
12515 static const struct processor *
12516 aarch64_get_arch (enum aarch64_arch arch)
12517 {
12518 if (arch != aarch64_no_arch)
12519 return &all_architectures[arch];
12520
12521 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12522
12523 return &all_architectures[cpu->arch];
12524 }
12525
12526 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12527
12528 static poly_uint16
12529 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12530 {
12531 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12532 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12533 deciding which .md file patterns to use and when deciding whether
12534 something is a legitimate address or constant. */
12535 if (value == SVE_SCALABLE || value == SVE_128)
12536 return poly_uint16 (2, 2);
12537 else
12538 return (int) value / 64;
12539 }
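
/* Editorial note (hedged, not part of this file): for the fixed-length
   settings the mapping above is simply bits / 64, e.g. 256 -> VG 4,
   512 -> VG 8, 2048 -> VG 32, while SVE_SCALABLE (and, per the comment
   above, SVE_128) keeps the length symbolic as poly_uint16 (2, 2).
   A scalar sketch of the fixed-length case; the function name is
   illustrative only.  */

static inline unsigned int
sve_vg_for_fixed_vector_bits (unsigned int bits)
{
  /* Assumes BITS is one of the fixed -msve-vector-bits values above 128.  */
  return bits / 64;
}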
12540
12541 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12542 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12543 tuning structs. In particular it must set selected_tune and
12544 aarch64_isa_flags that define the available ISA features and tuning
12545 decisions. It must also set selected_arch as this will be used to
12546 output the .arch asm tags for each function. */
12547
12548 static void
12549 aarch64_override_options (void)
12550 {
12551 uint64_t cpu_isa = 0;
12552 uint64_t arch_isa = 0;
12553 aarch64_isa_flags = 0;
12554
12555 bool valid_cpu = true;
12556 bool valid_tune = true;
12557 bool valid_arch = true;
12558
12559 selected_cpu = NULL;
12560 selected_arch = NULL;
12561 selected_tune = NULL;
12562
12563 if (aarch64_branch_protection_string)
12564 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12565
12566 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12567 If either of -march or -mtune is given, they override their
12568 respective component of -mcpu. */
12569 if (aarch64_cpu_string)
12570 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12571 &cpu_isa);
12572
12573 if (aarch64_arch_string)
12574 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12575 &arch_isa);
12576
12577 if (aarch64_tune_string)
12578 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12579
12580 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12581 SUBTARGET_OVERRIDE_OPTIONS;
12582 #endif
12583
12584 /* If the user did not specify a processor, choose the default
12585 one for them. This will be the CPU set during configuration using
12586 --with-cpu, otherwise it is "generic". */
12587 if (!selected_cpu)
12588 {
12589 if (selected_arch)
12590 {
12591 selected_cpu = &all_cores[selected_arch->ident];
12592 aarch64_isa_flags = arch_isa;
12593 explicit_arch = selected_arch->arch;
12594 }
12595 else
12596 {
12597 /* Get default configure-time CPU. */
12598 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12599 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12600 }
12601
12602 if (selected_tune)
12603 explicit_tune_core = selected_tune->ident;
12604 }
12605 /* If both -mcpu and -march are specified check that they are architecturally
12606 compatible, warn if they're not and prefer the -march ISA flags. */
12607 else if (selected_arch)
12608 {
12609 if (selected_arch->arch != selected_cpu->arch)
12610 {
12611 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12612 all_architectures[selected_cpu->arch].name,
12613 selected_arch->name);
12614 }
12615 aarch64_isa_flags = arch_isa;
12616 explicit_arch = selected_arch->arch;
12617 explicit_tune_core = selected_tune ? selected_tune->ident
12618 : selected_cpu->ident;
12619 }
12620 else
12621 {
12622 /* -mcpu but no -march. */
12623 aarch64_isa_flags = cpu_isa;
12624 explicit_tune_core = selected_tune ? selected_tune->ident
12625 : selected_cpu->ident;
12626 gcc_assert (selected_cpu);
12627 selected_arch = &all_architectures[selected_cpu->arch];
12628 explicit_arch = selected_arch->arch;
12629 }
12630
12631 /* Set the arch as well, as we will need it when outputting
12632 the .arch directive in assembly. */
12633 if (!selected_arch)
12634 {
12635 gcc_assert (selected_cpu);
12636 selected_arch = &all_architectures[selected_cpu->arch];
12637 }
12638
12639 if (!selected_tune)
12640 selected_tune = selected_cpu;
12641
12642 if (aarch64_enable_bti == 2)
12643 {
12644 #ifdef TARGET_ENABLE_BTI
12645 aarch64_enable_bti = 1;
12646 #else
12647 aarch64_enable_bti = 0;
12648 #endif
12649 }
12650
12651 /* Return address signing is currently not supported for ILP32 targets. For
12652 LP64 targets use the configured option in the absence of a command-line
12653 option for -mbranch-protection. */
12654 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12655 {
12656 #ifdef TARGET_ENABLE_PAC_RET
12657 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12658 #else
12659 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12660 #endif
12661 }
12662
12663 #ifndef HAVE_AS_MABI_OPTION
12664 /* The compiler may have been configured with 2.23.* binutils, which does
12665 not have support for ILP32. */
12666 if (TARGET_ILP32)
12667 error ("assembler does not support %<-mabi=ilp32%>");
12668 #endif
12669
12670 /* Convert -msve-vector-bits to a VG count. */
12671 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12672
12673 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12674 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12675
12676 /* Make sure we properly set up the explicit options. */
12677 if ((aarch64_cpu_string && valid_cpu)
12678 || (aarch64_tune_string && valid_tune))
12679 gcc_assert (explicit_tune_core != aarch64_none);
12680
12681 if ((aarch64_cpu_string && valid_cpu)
12682 || (aarch64_arch_string && valid_arch))
12683 gcc_assert (explicit_arch != aarch64_no_arch);
12684
12685 /* The pass to insert speculation tracking runs before
12686 shrink-wrapping and the latter does not know how to update the
12687 tracking status. So disable it in this case. */
12688 if (aarch64_track_speculation)
12689 flag_shrink_wrap = 0;
12690
12691 aarch64_override_options_internal (&global_options);
12692
12693 /* Save these options as the default ones in case we push and pop them later
12694 while processing functions with potential target attributes. */
12695 target_option_default_node = target_option_current_node
12696 = build_target_option_node (&global_options);
12697 }
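
/* Editorial worked example (hedged, not part of this file), restating the
   precedence implemented above:

     -mcpu=cortex-a57                     arch, tune and ISA taken from the CPU
     -march=armv8.1-a                     arch and ISA from -march; the CPU and
                                          tuning default to the core associated
                                          with that architecture entry
     -mcpu=cortex-a57 -mtune=cortex-a72   arch and ISA from the CPU, tuning
                                          overridden by -mtune
     -mcpu=cortex-a57 -march=armv8.1-a    arch and ISA from -march, with a
                                          warning if the CPU's architecture
                                          differs.  */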
12698
12699 /* Implement targetm.override_options_after_change. */
12700
12701 static void
12702 aarch64_override_options_after_change (void)
12703 {
12704 aarch64_override_options_after_change_1 (&global_options);
12705 }
12706
12707 static struct machine_function *
12708 aarch64_init_machine_status (void)
12709 {
12710 struct machine_function *machine;
12711 machine = ggc_cleared_alloc<machine_function> ();
12712 return machine;
12713 }
12714
12715 void
12716 aarch64_init_expanders (void)
12717 {
12718 init_machine_status = aarch64_init_machine_status;
12719 }
12720
12721 /* Select the code model to use given the command-line options, adjusting for -fpic/-fPIC and diagnosing unsupported combinations. */
12722 static void
12723 initialize_aarch64_code_model (struct gcc_options *opts)
12724 {
12725 if (opts->x_flag_pic)
12726 {
12727 switch (opts->x_aarch64_cmodel_var)
12728 {
12729 case AARCH64_CMODEL_TINY:
12730 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12731 break;
12732 case AARCH64_CMODEL_SMALL:
12733 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12734 aarch64_cmodel = (flag_pic == 2
12735 ? AARCH64_CMODEL_SMALL_PIC
12736 : AARCH64_CMODEL_SMALL_SPIC);
12737 #else
12738 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12739 #endif
12740 break;
12741 case AARCH64_CMODEL_LARGE:
12742 sorry ("code model %qs with %<-f%s%>", "large",
12743 opts->x_flag_pic > 1 ? "PIC" : "pic");
12744 break;
12745 default:
12746 gcc_unreachable ();
12747 }
12748 }
12749 else
12750 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12751 }
12752
12753 /* Implement TARGET_OPTION_SAVE. */
12754
12755 static void
12756 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12757 {
12758 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12759 ptr->x_aarch64_branch_protection_string
12760 = opts->x_aarch64_branch_protection_string;
12761 }
12762
12763 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12764 using the information saved in PTR. */
12765
12766 static void
12767 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12768 {
12769 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12770 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12771 opts->x_explicit_arch = ptr->x_explicit_arch;
12772 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12773 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12774 opts->x_aarch64_branch_protection_string
12775 = ptr->x_aarch64_branch_protection_string;
12776 if (opts->x_aarch64_branch_protection_string)
12777 {
12778 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12779 NULL);
12780 }
12781
12782 aarch64_override_options_internal (opts);
12783 }
12784
12785 /* Implement TARGET_OPTION_PRINT. */
12786
12787 static void
12788 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12789 {
12790 const struct processor *cpu
12791 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12792 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12793 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12794 std::string extension
12795 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12796
12797 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12798 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12799 arch->name, extension.c_str ());
12800 }
12801
12802 static GTY(()) tree aarch64_previous_fndecl;
12803
12804 void
12805 aarch64_reset_previous_fndecl (void)
12806 {
12807 aarch64_previous_fndecl = NULL;
12808 }
12809
12810 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12811 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12812 make sure optab availability predicates are recomputed when necessary. */
12813
12814 void
12815 aarch64_save_restore_target_globals (tree new_tree)
12816 {
12817 if (TREE_TARGET_GLOBALS (new_tree))
12818 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12819 else if (new_tree == target_option_default_node)
12820 restore_target_globals (&default_target_globals);
12821 else
12822 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12823 }
12824
12825 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12826 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12827 of the function, if such exists. This function may be called multiple
12828 times on a single function so use aarch64_previous_fndecl to avoid
12829 setting up identical state. */
12830
12831 static void
12832 aarch64_set_current_function (tree fndecl)
12833 {
12834 if (!fndecl || fndecl == aarch64_previous_fndecl)
12835 return;
12836
12837 tree old_tree = (aarch64_previous_fndecl
12838 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12839 : NULL_TREE);
12840
12841 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12842
12843 /* If current function has no attributes but the previous one did,
12844 use the default node. */
12845 if (!new_tree && old_tree)
12846 new_tree = target_option_default_node;
12847
12848 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12849 the default have been handled by aarch64_save_restore_target_globals from
12850 aarch64_pragma_target_parse. */
12851 if (old_tree == new_tree)
12852 return;
12853
12854 aarch64_previous_fndecl = fndecl;
12855
12856 /* First set the target options. */
12857 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12858
12859 aarch64_save_restore_target_globals (new_tree);
12860 }
12861
12862 /* Enum describing the various ways we can handle attributes.
12863 In many cases we can reuse the generic option handling machinery. */
12864
12865 enum aarch64_attr_opt_type
12866 {
12867 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12868 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12869 aarch64_attr_enum, /* Attribute sets an enum variable. */
12870 aarch64_attr_custom /* Attribute requires a custom handling function. */
12871 };
12872
12873 /* All the information needed to handle a target attribute.
12874 NAME is the name of the attribute.
12875 ATTR_TYPE specifies the type of behavior of the attribute as described
12876 in the definition of enum aarch64_attr_opt_type.
12877 ALLOW_NEG is true if the attribute supports a "no-" form.
12878 HANDLER is the function that takes the attribute string as an argument
12879 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12880 OPT_NUM is the enum specifying the option that the attribute modifies.
12881 This is needed for attributes that mirror the behavior of a command-line
12882 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12883 aarch64_attr_enum. */
12884
12885 struct aarch64_attribute_info
12886 {
12887 const char *name;
12888 enum aarch64_attr_opt_type attr_type;
12889 bool allow_neg;
12890 bool (*handler) (const char *);
12891 enum opt_code opt_num;
12892 };
12893
12894 /* Handle the argument STR to the arch= target attribute. */
12895
12896 static bool
12897 aarch64_handle_attr_arch (const char *str)
12898 {
12899 const struct processor *tmp_arch = NULL;
12900 std::string invalid_extension;
12901 enum aarch64_parse_opt_result parse_res
12902 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12903
12904 if (parse_res == AARCH64_PARSE_OK)
12905 {
12906 gcc_assert (tmp_arch);
12907 selected_arch = tmp_arch;
12908 explicit_arch = selected_arch->arch;
12909 return true;
12910 }
12911
12912 switch (parse_res)
12913 {
12914 case AARCH64_PARSE_MISSING_ARG:
12915 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12916 break;
12917 case AARCH64_PARSE_INVALID_ARG:
12918 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12919 aarch64_print_hint_for_arch (str);
12920 break;
12921 case AARCH64_PARSE_INVALID_FEATURE:
12922 error ("invalid feature modifier %s of value (\"%s\") in "
12923 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12924 aarch64_print_hint_for_extensions (invalid_extension);
12925 break;
12926 default:
12927 gcc_unreachable ();
12928 }
12929
12930 return false;
12931 }
12932
12933 /* Handle the argument STR to the cpu= target attribute. */
12934
12935 static bool
12936 aarch64_handle_attr_cpu (const char *str)
12937 {
12938 const struct processor *tmp_cpu = NULL;
12939 std::string invalid_extension;
12940 enum aarch64_parse_opt_result parse_res
12941 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12942
12943 if (parse_res == AARCH64_PARSE_OK)
12944 {
12945 gcc_assert (tmp_cpu);
12946 selected_tune = tmp_cpu;
12947 explicit_tune_core = selected_tune->ident;
12948
12949 selected_arch = &all_architectures[tmp_cpu->arch];
12950 explicit_arch = selected_arch->arch;
12951 return true;
12952 }
12953
12954 switch (parse_res)
12955 {
12956 case AARCH64_PARSE_MISSING_ARG:
12957 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12958 break;
12959 case AARCH64_PARSE_INVALID_ARG:
12960 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12961 aarch64_print_hint_for_core (str);
12962 break;
12963 case AARCH64_PARSE_INVALID_FEATURE:
12964 error ("invalid feature modifier %s of value (\"%s\") in "
12965 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12966 aarch64_print_hint_for_extensions (invalid_extension);
12967 break;
12968 default:
12969 gcc_unreachable ();
12970 }
12971
12972 return false;
12973 }
12974
12975 /* Handle the argument STR to the branch-protection= attribute. */
12976
12977 static bool
12978 aarch64_handle_attr_branch_protection (const char* str)
12979 {
12980 char *err_str = (char *) xmalloc (strlen (str) + 1);
12981 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12982 &err_str);
12983 bool success = false;
12984 switch (res)
12985 {
12986 case AARCH64_PARSE_MISSING_ARG:
12987 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12988 " attribute");
12989 break;
12990 case AARCH64_PARSE_INVALID_ARG:
12991 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12992 "=\")%> pragma or attribute", err_str);
12993 break;
12994 case AARCH64_PARSE_OK:
12995 success = true;
12996 /* Fall through. */
12997 case AARCH64_PARSE_INVALID_FEATURE:
12998 break;
12999 default:
13000 gcc_unreachable ();
13001 }
13002 free (err_str);
13003 return success;
13004 }
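
/* Editorial example (hedged, not part of this file): the same strings
   accepted by -mbranch-protection= can be applied per function via the
   "branch-protection" entry in the attribute table below; the function
   name is illustrative only.  */

__attribute__ ((target ("branch-protection=standard")))
void hardened_entry_point (void);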
13005
13006 /* Handle the argument STR to the tune= target attribute. */
13007
13008 static bool
13009 aarch64_handle_attr_tune (const char *str)
13010 {
13011 const struct processor *tmp_tune = NULL;
13012 enum aarch64_parse_opt_result parse_res
13013 = aarch64_parse_tune (str, &tmp_tune);
13014
13015 if (parse_res == AARCH64_PARSE_OK)
13016 {
13017 gcc_assert (tmp_tune);
13018 selected_tune = tmp_tune;
13019 explicit_tune_core = selected_tune->ident;
13020 return true;
13021 }
13022
13023 switch (parse_res)
13024 {
13025 case AARCH64_PARSE_INVALID_ARG:
13026 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13027 aarch64_print_hint_for_core (str);
13028 break;
13029 default:
13030 gcc_unreachable ();
13031 }
13032
13033 return false;
13034 }
13035
13036 /* Parse an architecture extensions target attribute string specified in STR.
13037 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13038 if successful. Update aarch64_isa_flags to reflect the ISA features
13039 modified. */
13040
13041 static bool
13042 aarch64_handle_attr_isa_flags (char *str)
13043 {
13044 enum aarch64_parse_opt_result parse_res;
13045 uint64_t isa_flags = aarch64_isa_flags;
13046
13047 /* We allow "+nothing" in the beginning to clear out all architectural
13048 features if the user wants to handpick specific features. */
13049 if (strncmp ("+nothing", str, 8) == 0)
13050 {
13051 isa_flags = 0;
13052 str += 8;
13053 }
13054
13055 std::string invalid_extension;
13056 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13057
13058 if (parse_res == AARCH64_PARSE_OK)
13059 {
13060 aarch64_isa_flags = isa_flags;
13061 return true;
13062 }
13063
13064 switch (parse_res)
13065 {
13066 case AARCH64_PARSE_MISSING_ARG:
13067 error ("missing value in %<target()%> pragma or attribute");
13068 break;
13069
13070 case AARCH64_PARSE_INVALID_FEATURE:
13071 error ("invalid feature modifier %s of value (\"%s\") in "
13072 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13073 break;
13074
13075 default:
13076 gcc_unreachable ();
13077 }
13078
13079 return false;
13080 }
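
/* Editorial example (hedged, not part of this file): an attribute string
   beginning with '+' is the bare-extension form handled above;
   "+nothing+fp" first clears the extension bits and then enables just the
   FP feature set.  The function name is illustrative only.  */

__attribute__ ((target ("+nothing+fp")))
void fp_only_helper (void);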
13081
13082 /* The target attributes that we support. On top of these we also support just
13083 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13084 handled explicitly in aarch64_process_one_target_attr. */
13085
13086 static const struct aarch64_attribute_info aarch64_attributes[] =
13087 {
13088 { "general-regs-only", aarch64_attr_mask, false, NULL,
13089 OPT_mgeneral_regs_only },
13090 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13091 OPT_mfix_cortex_a53_835769 },
13092 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13093 OPT_mfix_cortex_a53_843419 },
13094 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13095 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13096 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13097 OPT_momit_leaf_frame_pointer },
13098 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13099 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13100 OPT_march_ },
13101 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13102 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13103 OPT_mtune_ },
13104 { "branch-protection", aarch64_attr_custom, false,
13105 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13106 { "sign-return-address", aarch64_attr_enum, false, NULL,
13107 OPT_msign_return_address_ },
13108 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13109 };
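
/* Editorial example (hedged, not part of this file): typical uses of the
   entries above, with illustrative function names.  */

__attribute__ ((target ("arch=armv8.1-a")))
void armv8_1_only_path (void);

__attribute__ ((target ("no-omit-leaf-frame-pointer")))
void keep_leaf_frame_pointer (void);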
13110
13111 /* Parse ARG_STR which contains the definition of one target attribute.
13112 Show appropriate errors if any or return true if the attribute is valid. */
13113
13114 static bool
13115 aarch64_process_one_target_attr (char *arg_str)
13116 {
13117 bool invert = false;
13118
13119 size_t len = strlen (arg_str);
13120
13121 if (len == 0)
13122 {
13123 error ("malformed %<target()%> pragma or attribute");
13124 return false;
13125 }
13126
13127 char *str_to_check = (char *) alloca (len + 1);
13128 strcpy (str_to_check, arg_str);
13129
13130 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13131 It is easier to detect and handle it explicitly here rather than going
13132 through the machinery for the rest of the target attributes in this
13133 function. */
13134 if (*str_to_check == '+')
13135 return aarch64_handle_attr_isa_flags (str_to_check);
13136
13137 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13138 {
13139 invert = true;
13140 str_to_check += 3;
13141 }
13142 char *arg = strchr (str_to_check, '=');
13143
13144 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13145 and point ARG to "foo". */
13146 if (arg)
13147 {
13148 *arg = '\0';
13149 arg++;
13150 }
13151 const struct aarch64_attribute_info *p_attr;
13152 bool found = false;
13153 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13154 {
13155 /* If the names don't match up, or the user has given an argument
13156 to an attribute that doesn't accept one, or didn't give an argument
13157 to an attribute that expects one, fail to match. */
13158 if (strcmp (str_to_check, p_attr->name) != 0)
13159 continue;
13160
13161 found = true;
13162 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13163 || p_attr->attr_type == aarch64_attr_enum;
13164
13165 if (attr_need_arg_p ^ (arg != NULL))
13166 {
13167 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13168 return false;
13169 }
13170
13171 /* If the name matches but the attribute does not allow "no-" versions
13172 then we can't match. */
13173 if (invert && !p_attr->allow_neg)
13174 {
13175 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13176 return false;
13177 }
13178
13179 switch (p_attr->attr_type)
13180 {
13181 /* Has a custom handler registered.
13182 For example, cpu=, arch=, tune=. */
13183 case aarch64_attr_custom:
13184 gcc_assert (p_attr->handler);
13185 if (!p_attr->handler (arg))
13186 return false;
13187 break;
13188
13189 /* Either set or unset a boolean option. */
13190 case aarch64_attr_bool:
13191 {
13192 struct cl_decoded_option decoded;
13193
13194 generate_option (p_attr->opt_num, NULL, !invert,
13195 CL_TARGET, &decoded);
13196 aarch64_handle_option (&global_options, &global_options_set,
13197 &decoded, input_location);
13198 break;
13199 }
13200 /* Set or unset a bit in the target_flags. aarch64_handle_option
13201 should know what mask to apply given the option number. */
13202 case aarch64_attr_mask:
13203 {
13204 struct cl_decoded_option decoded;
13205 /* We only need to specify the option number.
13206 aarch64_handle_option will know which mask to apply. */
13207 decoded.opt_index = p_attr->opt_num;
13208 decoded.value = !invert;
13209 aarch64_handle_option (&global_options, &global_options_set,
13210 &decoded, input_location);
13211 break;
13212 }
13213 /* Use the option setting machinery to set an option to an enum. */
13214 case aarch64_attr_enum:
13215 {
13216 gcc_assert (arg);
13217 bool valid;
13218 int value;
13219 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13220 &value, CL_TARGET);
13221 if (valid)
13222 {
13223 set_option (&global_options, NULL, p_attr->opt_num, value,
13224 NULL, DK_UNSPECIFIED, input_location,
13225 global_dc);
13226 }
13227 else
13228 {
13229 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13230 }
13231 break;
13232 }
13233 default:
13234 gcc_unreachable ();
13235 }
13236 }
13237
13238 /* If we reached here we either have found an attribute and validated
13239 it or didn't match any. If we matched an attribute but its arguments
13240 were malformed we will have returned false already. */
13241 return found;
13242 }
13243
13244 /* Count how many times the character C appears in
13245 NULL-terminated string STR. */
13246
13247 static unsigned int
13248 num_occurences_in_str (char c, char *str)
13249 {
13250 unsigned int res = 0;
13251 while (*str != '\0')
13252 {
13253 if (*str == c)
13254 res++;
13255
13256 str++;
13257 }
13258
13259 return res;
13260 }
13261
13262 /* Parse the tree in ARGS that contains the target attribute information
13263 and update the global target options space. */
13264
13265 bool
13266 aarch64_process_target_attr (tree args)
13267 {
13268 if (TREE_CODE (args) == TREE_LIST)
13269 {
13270 do
13271 {
13272 tree head = TREE_VALUE (args);
13273 if (head)
13274 {
13275 if (!aarch64_process_target_attr (head))
13276 return false;
13277 }
13278 args = TREE_CHAIN (args);
13279 } while (args);
13280
13281 return true;
13282 }
13283
13284 if (TREE_CODE (args) != STRING_CST)
13285 {
13286 error ("attribute %<target%> argument not a string");
13287 return false;
13288 }
13289
13290 size_t len = strlen (TREE_STRING_POINTER (args));
13291 char *str_to_check = (char *) alloca (len + 1);
13292 strcpy (str_to_check, TREE_STRING_POINTER (args));
13293
13294 if (len == 0)
13295 {
13296 error ("malformed %<target()%> pragma or attribute");
13297 return false;
13298 }
13299
13300 /* Used to catch empty tokens between commas, e.g.
13301 attribute ((target ("attr1,,attr2"))). */
13302 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13303
13304 /* Handle multiple target attributes separated by ','. */
13305 char *token = strtok_r (str_to_check, ",", &str_to_check);
13306
13307 unsigned int num_attrs = 0;
13308 while (token)
13309 {
13310 num_attrs++;
13311 if (!aarch64_process_one_target_attr (token))
13312 {
13313 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13314 return false;
13315 }
13316
13317 token = strtok_r (NULL, ",", &str_to_check);
13318 }
13319
13320 if (num_attrs != num_commas + 1)
13321 {
13322 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13323 return false;
13324 }
13325
13326 return true;
13327 }
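
/* Editorial example (hedged, not part of this file): several attributes may
   be combined in one comma-separated string; an empty token, as in
   "strict-align,,tune=cortex-a53", is caught by the comma/attribute count
   check above.  The function name is illustrative only.  */

__attribute__ ((target ("strict-align,tune=cortex-a53")))
void strict_align_routine (void);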
13328
13329 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13330 process attribute ((target ("..."))). */
13331
13332 static bool
13333 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13334 {
13335 struct cl_target_option cur_target;
13336 bool ret;
13337 tree old_optimize;
13338 tree new_target, new_optimize;
13339 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13340
13341 /* If what we're processing is the current pragma string then the
13342 target option node is already stored in target_option_current_node
13343 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13344 having to re-parse the string. This is especially useful to keep
13345 arm_neon.h compile times down since that header contains a lot
13346 of intrinsics enclosed in pragmas. */
13347 if (!existing_target && args == current_target_pragma)
13348 {
13349 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13350 return true;
13351 }
13352 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13353
13354 old_optimize = build_optimization_node (&global_options);
13355 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13356
13357 /* If the function changed the optimization levels as well as setting
13358 target options, start with the optimizations specified. */
13359 if (func_optimize && func_optimize != old_optimize)
13360 cl_optimization_restore (&global_options,
13361 TREE_OPTIMIZATION (func_optimize));
13362
13363 /* Save the current target options to restore at the end. */
13364 cl_target_option_save (&cur_target, &global_options);
13365
13366 /* If fndecl already has some target attributes applied to it, unpack
13367 them so that we add this attribute on top of them, rather than
13368 overwriting them. */
13369 if (existing_target)
13370 {
13371 struct cl_target_option *existing_options
13372 = TREE_TARGET_OPTION (existing_target);
13373
13374 if (existing_options)
13375 cl_target_option_restore (&global_options, existing_options);
13376 }
13377 else
13378 cl_target_option_restore (&global_options,
13379 TREE_TARGET_OPTION (target_option_current_node));
13380
13381 ret = aarch64_process_target_attr (args);
13382
13383 /* Set up any additional state. */
13384 if (ret)
13385 {
13386 aarch64_override_options_internal (&global_options);
13387 /* Initialize SIMD builtins if we haven't already.
13388 Set current_target_pragma to NULL for the duration so that
13389 the builtin initialization code doesn't try to tag the functions
13390 being built with the attributes specified by any current pragma, thus
13391 going into an infinite recursion. */
13392 if (TARGET_SIMD)
13393 {
13394 tree saved_current_target_pragma = current_target_pragma;
13395 current_target_pragma = NULL;
13396 aarch64_init_simd_builtins ();
13397 current_target_pragma = saved_current_target_pragma;
13398 }
13399 new_target = build_target_option_node (&global_options);
13400 }
13401 else
13402 new_target = NULL;
13403
13404 new_optimize = build_optimization_node (&global_options);
13405
13406 if (fndecl && ret)
13407 {
13408 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13409
13410 if (old_optimize != new_optimize)
13411 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13412 }
13413
13414 cl_target_option_restore (&global_options, &cur_target);
13415
13416 if (old_optimize != new_optimize)
13417 cl_optimization_restore (&global_options,
13418 TREE_OPTIMIZATION (old_optimize));
13419 return ret;
13420 }
13421
13422 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13423 tri-bool options (yes, no, don't care) and the default value is
13424 DEF, determine whether to reject inlining. */
13425
13426 static bool
13427 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13428 int dont_care, int def)
13429 {
13430 /* If the callee doesn't care, always allow inlining. */
13431 if (callee == dont_care)
13432 return true;
13433
13434 /* If the caller doesn't care, always allow inlining. */
13435 if (caller == dont_care)
13436 return true;
13437
13438 /* Otherwise, allow inlining if either the callee and caller values
13439 agree, or if the callee is using the default value. */
13440 return (callee == caller || callee == def);
13441 }
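
/* Editorial worked example (hedged, not part of this file): with
   DONT_CARE == 2 (the value passed by the callers below) and DEF == 0:

     caller 1, callee 2  ->  true   (callee does not care)
     caller 2, callee 1  ->  true   (caller does not care)
     caller 1, callee 1  ->  true   (values agree)
     caller 1, callee 0  ->  true   (callee uses the default)
     caller 0, callee 1  ->  false  (explicit, non-default mismatch)  */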
13442
13443 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13444 to inline CALLEE into CALLER based on target-specific info.
13445 Make sure that the caller and callee have compatible architectural
13446 features. Then go through the other possible target attributes
13447 and see if they can block inlining. Try not to reject always_inline
13448 callees unless they are incompatible architecturally. */
13449
13450 static bool
13451 aarch64_can_inline_p (tree caller, tree callee)
13452 {
13453 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13454 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13455
13456 struct cl_target_option *caller_opts
13457 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13458 : target_option_default_node);
13459
13460 struct cl_target_option *callee_opts
13461 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13462 : target_option_default_node);
13463
13464 /* Callee's ISA flags should be a subset of the caller's. */
13465 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13466 != callee_opts->x_aarch64_isa_flags)
13467 return false;
13468
13469 /* Allow non-strict aligned functions inlining into strict
13470 aligned ones. */
13471 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13472 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13473 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13474 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13475 return false;
13476
13477 bool always_inline = lookup_attribute ("always_inline",
13478 DECL_ATTRIBUTES (callee));
13479
13480 /* If the architectural features match up and the callee is always_inline
13481 then the other attributes don't matter. */
13482 if (always_inline)
13483 return true;
13484
13485 if (caller_opts->x_aarch64_cmodel_var
13486 != callee_opts->x_aarch64_cmodel_var)
13487 return false;
13488
13489 if (caller_opts->x_aarch64_tls_dialect
13490 != callee_opts->x_aarch64_tls_dialect)
13491 return false;
13492
13493 /* Honour explicit requests to work around errata. */
13494 if (!aarch64_tribools_ok_for_inlining_p (
13495 caller_opts->x_aarch64_fix_a53_err835769,
13496 callee_opts->x_aarch64_fix_a53_err835769,
13497 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13498 return false;
13499
13500 if (!aarch64_tribools_ok_for_inlining_p (
13501 caller_opts->x_aarch64_fix_a53_err843419,
13502 callee_opts->x_aarch64_fix_a53_err843419,
13503 2, TARGET_FIX_ERR_A53_843419))
13504 return false;
13505
13506 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13507 caller and callee and they don't match up, reject inlining. */
13508 if (!aarch64_tribools_ok_for_inlining_p (
13509 caller_opts->x_flag_omit_leaf_frame_pointer,
13510 callee_opts->x_flag_omit_leaf_frame_pointer,
13511 2, 1))
13512 return false;
13513
13514 /* If the callee has specific tuning overrides, respect them. */
13515 if (callee_opts->x_aarch64_override_tune_string != NULL
13516 && caller_opts->x_aarch64_override_tune_string == NULL)
13517 return false;
13518
13519 /* If the user specified tuning override strings for the
13520 caller and callee and they don't match up, reject inlining.
13521 We just do a string compare here, we don't analyze the meaning
13522 of the string, as it would be too costly for little gain. */
13523 if (callee_opts->x_aarch64_override_tune_string
13524 && caller_opts->x_aarch64_override_tune_string
13525 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13526 caller_opts->x_aarch64_override_tune_string) != 0))
13527 return false;
13528
13529 return true;
13530 }
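
/* Editorial example (hedged, not part of this file): under the ISA-subset
   rule above, CALLEE_SVE cannot be inlined into PLAIN_CALLER because the
   callee enables SVE that the caller does not; a plain callee inlined into
   an SVE-enabled caller would pass that particular check.  Names are
   illustrative only.  */

__attribute__ ((target ("+sve")))
static int callee_sve (int x) { return x + 1; }

static int plain_caller (int x) { return callee_sve (x); }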
13531
13532 /* Return true if SYMBOL_REF X binds locally. */
13533
13534 static bool
13535 aarch64_symbol_binds_local_p (const_rtx x)
13536 {
13537 return (SYMBOL_REF_DECL (x)
13538 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13539 : SYMBOL_REF_LOCAL_P (x));
13540 }
13541
13542 /* Return true if SYMBOL_REF X is thread local */
13543 static bool
13544 aarch64_tls_symbol_p (rtx x)
13545 {
13546 if (! TARGET_HAVE_TLS)
13547 return false;
13548
13549 if (GET_CODE (x) != SYMBOL_REF)
13550 return false;
13551
13552 return SYMBOL_REF_TLS_MODEL (x) != 0;
13553 }
13554
13555 /* Classify a TLS symbol into one of the TLS kinds. */
13556 enum aarch64_symbol_type
13557 aarch64_classify_tls_symbol (rtx x)
13558 {
13559 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13560
13561 switch (tls_kind)
13562 {
13563 case TLS_MODEL_GLOBAL_DYNAMIC:
13564 case TLS_MODEL_LOCAL_DYNAMIC:
13565 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13566
13567 case TLS_MODEL_INITIAL_EXEC:
13568 switch (aarch64_cmodel)
13569 {
13570 case AARCH64_CMODEL_TINY:
13571 case AARCH64_CMODEL_TINY_PIC:
13572 return SYMBOL_TINY_TLSIE;
13573 default:
13574 return SYMBOL_SMALL_TLSIE;
13575 }
13576
13577 case TLS_MODEL_LOCAL_EXEC:
13578 if (aarch64_tls_size == 12)
13579 return SYMBOL_TLSLE12;
13580 else if (aarch64_tls_size == 24)
13581 return SYMBOL_TLSLE24;
13582 else if (aarch64_tls_size == 32)
13583 return SYMBOL_TLSLE32;
13584 else if (aarch64_tls_size == 48)
13585 return SYMBOL_TLSLE48;
13586 else
13587 gcc_unreachable ();
13588
13589 case TLS_MODEL_EMULATED:
13590 case TLS_MODEL_NONE:
13591 return SYMBOL_FORCE_TO_MEM;
13592
13593 default:
13594 gcc_unreachable ();
13595 }
13596 }
13597
13598 /* Return the correct method for accessing X + OFFSET, where X is either
13599 a SYMBOL_REF or LABEL_REF. */
13600
13601 enum aarch64_symbol_type
13602 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13603 {
13604 if (GET_CODE (x) == LABEL_REF)
13605 {
13606 switch (aarch64_cmodel)
13607 {
13608 case AARCH64_CMODEL_LARGE:
13609 return SYMBOL_FORCE_TO_MEM;
13610
13611 case AARCH64_CMODEL_TINY_PIC:
13612 case AARCH64_CMODEL_TINY:
13613 return SYMBOL_TINY_ABSOLUTE;
13614
13615 case AARCH64_CMODEL_SMALL_SPIC:
13616 case AARCH64_CMODEL_SMALL_PIC:
13617 case AARCH64_CMODEL_SMALL:
13618 return SYMBOL_SMALL_ABSOLUTE;
13619
13620 default:
13621 gcc_unreachable ();
13622 }
13623 }
13624
13625 if (GET_CODE (x) == SYMBOL_REF)
13626 {
13627 if (aarch64_tls_symbol_p (x))
13628 return aarch64_classify_tls_symbol (x);
13629
13630 switch (aarch64_cmodel)
13631 {
13632 case AARCH64_CMODEL_TINY:
13633 /* When we retrieve symbol + offset address, we have to make sure
13634 the offset does not cause overflow of the final address. But
13635 we have no way of knowing the address of symbol at compile time
13636 so we can't accurately say if the distance between the PC and
13637 symbol + offset is outside the addressable range of +/-1M in the
13638 TINY code model. So we rely on images not being greater than
13639 1M, cap the offset at 1M, and require anything beyond 1M to
13640 be loaded using an alternative mechanism. Furthermore if the
13641 symbol is a weak reference to something that isn't known to
13642 resolve to a symbol in this module, then force to memory. */
13643 if ((SYMBOL_REF_WEAK (x)
13644 && !aarch64_symbol_binds_local_p (x))
13645 || !IN_RANGE (offset, -1048575, 1048575))
13646 return SYMBOL_FORCE_TO_MEM;
13647 return SYMBOL_TINY_ABSOLUTE;
13648
13649 case AARCH64_CMODEL_SMALL:
13650 /* Same reasoning as the tiny code model, but the offset cap here is
13651 4G. */
13652 if ((SYMBOL_REF_WEAK (x)
13653 && !aarch64_symbol_binds_local_p (x))
13654 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13655 HOST_WIDE_INT_C (4294967264)))
13656 return SYMBOL_FORCE_TO_MEM;
13657 return SYMBOL_SMALL_ABSOLUTE;
13658
13659 case AARCH64_CMODEL_TINY_PIC:
13660 if (!aarch64_symbol_binds_local_p (x))
13661 return SYMBOL_TINY_GOT;
13662 return SYMBOL_TINY_ABSOLUTE;
13663
13664 case AARCH64_CMODEL_SMALL_SPIC:
13665 case AARCH64_CMODEL_SMALL_PIC:
13666 if (!aarch64_symbol_binds_local_p (x))
13667 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13668 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13669 return SYMBOL_SMALL_ABSOLUTE;
13670
13671 case AARCH64_CMODEL_LARGE:
13672 /* This is alright even in PIC code as the constant
13673 pool reference is always PC relative and within
13674 the same translation unit. */
13675 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13676 return SYMBOL_SMALL_ABSOLUTE;
13677 else
13678 return SYMBOL_FORCE_TO_MEM;
13679
13680 default:
13681 gcc_unreachable ();
13682 }
13683 }
13684
13685 /* By default push everything into the constant pool. */
13686 return SYMBOL_FORCE_TO_MEM;
13687 }
13688
13689 bool
13690 aarch64_constant_address_p (rtx x)
13691 {
13692 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13693 }
13694
13695 bool
13696 aarch64_legitimate_pic_operand_p (rtx x)
13697 {
13698 if (GET_CODE (x) == SYMBOL_REF
13699 || (GET_CODE (x) == CONST
13700 && GET_CODE (XEXP (x, 0)) == PLUS
13701 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13702 return false;
13703
13704 return true;
13705 }
13706
13707 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13708 that should be rematerialized rather than spilled. */
13709
13710 static bool
13711 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13712 {
13713 /* Support CSE and rematerialization of common constants. */
13714 if (CONST_INT_P (x)
13715 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13716 || GET_CODE (x) == CONST_VECTOR)
13717 return true;
13718
13719 /* Do not allow vector struct mode constants for Advanced SIMD.
13720 We could support 0 and -1 easily, but they need support in
13721 aarch64-simd.md. */
13722 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13723 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13724 return false;
13725
13726 /* Only accept variable-length vector constants if they can be
13727 handled directly.
13728
13729 ??? It would be possible to handle rematerialization of other
13730 constants via secondary reloads. */
13731 if (vec_flags & VEC_ANY_SVE)
13732 return aarch64_simd_valid_immediate (x, NULL);
13733
13734 if (GET_CODE (x) == HIGH)
13735 x = XEXP (x, 0);
13736
13737 /* Accept polynomial constants that can be calculated by using the
13738 destination of a move as the sole temporary. Constants that
13739 require a second temporary cannot be rematerialized (they can't be
13740 forced to memory and also aren't legitimate constants). */
13741 poly_int64 offset;
13742 if (poly_int_rtx_p (x, &offset))
13743 return aarch64_offset_temporaries (false, offset) <= 1;
13744
13745 /* If an offset is being added to something else, we need to allow the
13746 base to be moved into the destination register, meaning that there
13747 are no free temporaries for the offset. */
13748 x = strip_offset (x, &offset);
13749 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13750 return false;
13751
13752 /* Do not allow const (plus (anchor_symbol, const_int)). */
13753 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13754 return false;
13755
13756 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13757 so spilling them is better than rematerialization. */
13758 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13759 return true;
13760
13761 /* Label references are always constant. */
13762 if (GET_CODE (x) == LABEL_REF)
13763 return true;
13764
13765 return false;
13766 }
13767
13768 rtx
13769 aarch64_load_tp (rtx target)
13770 {
13771 if (!target
13772 || GET_MODE (target) != Pmode
13773 || !register_operand (target, Pmode))
13774 target = gen_reg_rtx (Pmode);
13775
13776 /* Can return in any reg. */
13777 emit_insn (gen_aarch64_load_tp_hard (target));
13778 return target;
13779 }
13780
13781 /* On AAPCS systems, this is the "struct __va_list". */
13782 static GTY(()) tree va_list_type;
13783
13784 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13785 Return the type to use as __builtin_va_list.
13786
13787 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13788
13789 struct __va_list
13790 {
13791 void *__stack;
13792 void *__gr_top;
13793 void *__vr_top;
13794 int __gr_offs;
13795 int __vr_offs;
13796 }; */
13797
13798 static tree
13799 aarch64_build_builtin_va_list (void)
13800 {
13801 tree va_list_name;
13802 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13803
13804 /* Create the type. */
13805 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13806 /* Give it the required name. */
13807 va_list_name = build_decl (BUILTINS_LOCATION,
13808 TYPE_DECL,
13809 get_identifier ("__va_list"),
13810 va_list_type);
13811 DECL_ARTIFICIAL (va_list_name) = 1;
13812 TYPE_NAME (va_list_type) = va_list_name;
13813 TYPE_STUB_DECL (va_list_type) = va_list_name;
13814
13815 /* Create the fields. */
13816 f_stack = build_decl (BUILTINS_LOCATION,
13817 FIELD_DECL, get_identifier ("__stack"),
13818 ptr_type_node);
13819 f_grtop = build_decl (BUILTINS_LOCATION,
13820 FIELD_DECL, get_identifier ("__gr_top"),
13821 ptr_type_node);
13822 f_vrtop = build_decl (BUILTINS_LOCATION,
13823 FIELD_DECL, get_identifier ("__vr_top"),
13824 ptr_type_node);
13825 f_groff = build_decl (BUILTINS_LOCATION,
13826 FIELD_DECL, get_identifier ("__gr_offs"),
13827 integer_type_node);
13828 f_vroff = build_decl (BUILTINS_LOCATION,
13829 FIELD_DECL, get_identifier ("__vr_offs"),
13830 integer_type_node);
13831
13832 /* Tell tree-stdarg pass about our internal offset fields.
13833 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13834 purposes, to identify whether the code is updating the va_list internal
13835 offset fields in an irregular way. */
13836 va_list_gpr_counter_field = f_groff;
13837 va_list_fpr_counter_field = f_vroff;
13838
13839 DECL_ARTIFICIAL (f_stack) = 1;
13840 DECL_ARTIFICIAL (f_grtop) = 1;
13841 DECL_ARTIFICIAL (f_vrtop) = 1;
13842 DECL_ARTIFICIAL (f_groff) = 1;
13843 DECL_ARTIFICIAL (f_vroff) = 1;
13844
13845 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13846 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13847 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13848 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13849 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13850
13851 TYPE_FIELDS (va_list_type) = f_stack;
13852 DECL_CHAIN (f_stack) = f_grtop;
13853 DECL_CHAIN (f_grtop) = f_vrtop;
13854 DECL_CHAIN (f_vrtop) = f_groff;
13855 DECL_CHAIN (f_groff) = f_vroff;
13856
13857 /* Compute its layout. */
13858 layout_type (va_list_type);
13859
13860 return va_list_type;
13861 }
13862
13863 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13864 static void
13865 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13866 {
13867 const CUMULATIVE_ARGS *cum;
13868 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13869 tree stack, grtop, vrtop, groff, vroff;
13870 tree t;
13871 int gr_save_area_size = cfun->va_list_gpr_size;
13872 int vr_save_area_size = cfun->va_list_fpr_size;
13873 int vr_offset;
13874
13875 cum = &crtl->args.info;
13876 if (cfun->va_list_gpr_size)
13877 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13878 cfun->va_list_gpr_size);
13879 if (cfun->va_list_fpr_size)
13880 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13881 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13882
13883 if (!TARGET_FLOAT)
13884 {
13885 gcc_assert (cum->aapcs_nvrn == 0);
13886 vr_save_area_size = 0;
13887 }
13888
13889 f_stack = TYPE_FIELDS (va_list_type_node);
13890 f_grtop = DECL_CHAIN (f_stack);
13891 f_vrtop = DECL_CHAIN (f_grtop);
13892 f_groff = DECL_CHAIN (f_vrtop);
13893 f_vroff = DECL_CHAIN (f_groff);
13894
13895 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13896 NULL_TREE);
13897 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13898 NULL_TREE);
13899 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13900 NULL_TREE);
13901 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13902 NULL_TREE);
13903 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13904 NULL_TREE);
13905
13906 /* Emit code to initialize STACK, which points to the next varargs stack
13907 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13908 by named arguments. STACK is 8-byte aligned. */
13909 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13910 if (cum->aapcs_stack_size > 0)
13911 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13912 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13913 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13914
13915 /* Emit code to initialize GRTOP, the top of the GR save area.
13916 virtual_incoming_args_rtx should have been 16 byte aligned. */
13917 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13918 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13919 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13920
13921 /* Emit code to initialize VRTOP, the top of the VR save area.
13922 This address is gr_save_area_bytes below GRTOP, rounded
13923 down to the next 16-byte boundary. */
13924 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13925 vr_offset = ROUND_UP (gr_save_area_size,
13926 STACK_BOUNDARY / BITS_PER_UNIT);
13927
13928 if (vr_offset)
13929 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13930 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13931 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13932
13933 /* Emit code to initialize GROFF, the offset from GRTOP of the
13934 next GPR argument. */
13935 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13936 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13937 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13938
13939 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13940 of the next VR argument. */
13941 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13942 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13943 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13944 }
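
/* Editorial example (hedged, not part of this file): a variadic function
   like the one below is the kind of consumer served by the expansion above;
   va_start seeds __stack, __gr_top, __vr_top, __gr_offs and __vr_offs, and
   each va_arg then draws from the matching save area or the stack.  The
   function name is illustrative only.  */

#include <stdarg.h>

static double
sum_doubles (int count, ...)
{
  va_list ap;
  double total = 0.0;
  va_start (ap, count);
  for (int i = 0; i < count; i++)
    total += va_arg (ap, double);
  va_end (ap);
  return total;
}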
13945
13946 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13947
13948 static tree
13949 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13950 gimple_seq *post_p ATTRIBUTE_UNUSED)
13951 {
13952 tree addr;
13953 bool indirect_p;
13954 bool is_ha; /* is HFA or HVA. */
13955 bool dw_align; /* double-word align. */
13956 machine_mode ag_mode = VOIDmode;
13957 int nregs;
13958 machine_mode mode;
13959
13960 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13961 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13962 HOST_WIDE_INT size, rsize, adjust, align;
13963 tree t, u, cond1, cond2;
13964
13965 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13966 if (indirect_p)
13967 type = build_pointer_type (type);
13968
13969 mode = TYPE_MODE (type);
13970
13971 f_stack = TYPE_FIELDS (va_list_type_node);
13972 f_grtop = DECL_CHAIN (f_stack);
13973 f_vrtop = DECL_CHAIN (f_grtop);
13974 f_groff = DECL_CHAIN (f_vrtop);
13975 f_vroff = DECL_CHAIN (f_groff);
13976
13977 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13978 f_stack, NULL_TREE);
13979 size = int_size_in_bytes (type);
13980
13981 bool abi_break;
13982 align
13983 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13984
13985 dw_align = false;
13986 adjust = 0;
13987 if (aarch64_vfp_is_call_or_return_candidate (mode,
13988 type,
13989 &ag_mode,
13990 &nregs,
13991 &is_ha))
13992 {
13993 /* No frontends can create types with variable-sized modes, so we
13994 shouldn't be asked to pass or return them. */
13995 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13996
13997 /* TYPE passed in fp/simd registers. */
13998 if (!TARGET_FLOAT)
13999 aarch64_err_no_fpadvsimd (mode);
14000
14001 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14002 unshare_expr (valist), f_vrtop, NULL_TREE);
14003 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14004 unshare_expr (valist), f_vroff, NULL_TREE);
14005
14006 rsize = nregs * UNITS_PER_VREG;
14007
14008 if (is_ha)
14009 {
14010 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14011 adjust = UNITS_PER_VREG - ag_size;
14012 }
14013 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14014 && size < UNITS_PER_VREG)
14015 {
14016 adjust = UNITS_PER_VREG - size;
14017 }
14018 }
14019 else
14020 {
14021 /* TYPE passed in general registers. */
14022 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14023 unshare_expr (valist), f_grtop, NULL_TREE);
14024 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14025 unshare_expr (valist), f_groff, NULL_TREE);
14026 rsize = ROUND_UP (size, UNITS_PER_WORD);
14027 nregs = rsize / UNITS_PER_WORD;
14028
14029 if (align > 8)
14030 {
14031 if (abi_break && warn_psabi)
14032 inform (input_location, "parameter passing for argument of type "
14033 "%qT changed in GCC 9.1", type);
14034 dw_align = true;
14035 }
14036
14037 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14038 && size < UNITS_PER_WORD)
14039 {
14040 adjust = UNITS_PER_WORD - size;
14041 }
14042 }
14043
14044 /* Get a local temporary for the field value. */
14045 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14046
14047 /* Emit code to branch if off >= 0. */
14048 t = build2 (GE_EXPR, boolean_type_node, off,
14049 build_int_cst (TREE_TYPE (off), 0));
14050 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14051
14052 if (dw_align)
14053 {
14054 /* Emit: offs = (offs + 15) & -16. */
14055 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14056 build_int_cst (TREE_TYPE (off), 15));
14057 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14058 build_int_cst (TREE_TYPE (off), -16));
14059 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14060 }
14061 else
14062 roundup = NULL;
14063
14064 /* Update ap.__[g|v]r_offs */
14065 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14066 build_int_cst (TREE_TYPE (off), rsize));
14067 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14068
14069 /* String up. */
14070 if (roundup)
14071 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14072
14073 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14074 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14075 build_int_cst (TREE_TYPE (f_off), 0));
14076 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14077
14078 /* String up: make sure the assignment happens before the use. */
14079 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14080 COND_EXPR_ELSE (cond1) = t;
14081
14082 /* Prepare the trees handling the argument that is passed on the stack;
14083 the top-level node will be stored in ON_STACK. */
14084 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14085 if (align > 8)
14086 {
14087 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14088 t = fold_build_pointer_plus_hwi (arg, 15);
14089 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14090 build_int_cst (TREE_TYPE (t), -16));
14091 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14092 }
14093 else
14094 roundup = NULL;
14095 /* Advance ap.__stack */
14096 t = fold_build_pointer_plus_hwi (arg, size + 7);
14097 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14098 build_int_cst (TREE_TYPE (t), -8));
14099 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14100 /* String up roundup and advance. */
14101 if (roundup)
14102 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14103 /* String up with arg */
14104 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14105 /* Big-endianness related address adjustment. */
14106 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14107 && size < UNITS_PER_WORD)
14108 {
14109 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14110 size_int (UNITS_PER_WORD - size));
14111 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14112 }
14113
14114 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14115 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14116
14117 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14118 t = off;
14119 if (adjust)
14120 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14121 build_int_cst (TREE_TYPE (off), adjust));
14122
14123 t = fold_convert (sizetype, t);
14124 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14125
14126 if (is_ha)
14127 {
14128 /* type ha; // treat as "struct {ftype field[n];}"
14129 ... [computing offs]
14130 for (i = 0; i <nregs; ++i, offs += 16)
14131 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14132 return ha; */
14133 int i;
14134 tree tmp_ha, field_t, field_ptr_t;
14135
14136 /* Declare a local variable. */
14137 tmp_ha = create_tmp_var_raw (type, "ha");
14138 gimple_add_tmp_var (tmp_ha);
14139
14140 /* Establish the base type. */
14141 switch (ag_mode)
14142 {
14143 case E_SFmode:
14144 field_t = float_type_node;
14145 field_ptr_t = float_ptr_type_node;
14146 break;
14147 case E_DFmode:
14148 field_t = double_type_node;
14149 field_ptr_t = double_ptr_type_node;
14150 break;
14151 case E_TFmode:
14152 field_t = long_double_type_node;
14153 field_ptr_t = long_double_ptr_type_node;
14154 break;
14155 case E_HFmode:
14156 field_t = aarch64_fp16_type_node;
14157 field_ptr_t = aarch64_fp16_ptr_type_node;
14158 break;
14159 case E_V2SImode:
14160 case E_V4SImode:
14161 {
14162 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14163 field_t = build_vector_type_for_mode (innertype, ag_mode);
14164 field_ptr_t = build_pointer_type (field_t);
14165 }
14166 break;
14167 default:
14168 gcc_assert (0);
14169 }
14170
14171 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14172 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14173 addr = t;
14174 t = fold_convert (field_ptr_t, addr);
14175 t = build2 (MODIFY_EXPR, field_t,
14176 build1 (INDIRECT_REF, field_t, tmp_ha),
14177 build1 (INDIRECT_REF, field_t, t));
14178
14179 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14180 for (i = 1; i < nregs; ++i)
14181 {
14182 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14183 u = fold_convert (field_ptr_t, addr);
14184 u = build2 (MODIFY_EXPR, field_t,
14185 build2 (MEM_REF, field_t, tmp_ha,
14186 build_int_cst (field_ptr_t,
14187 (i *
14188 int_size_in_bytes (field_t)))),
14189 build1 (INDIRECT_REF, field_t, u));
14190 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14191 }
14192
14193 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14194 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14195 }
14196
14197 COND_EXPR_ELSE (cond2) = t;
14198 addr = fold_convert (build_pointer_type (type), cond1);
14199 addr = build_va_arg_indirect_ref (addr);
14200
14201 if (indirect_p)
14202 addr = build_va_arg_indirect_ref (addr);
14203
14204 return addr;
14205 }
14206
14207 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14208
14209 static void
14210 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14211 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14212 int no_rtl)
14213 {
14214 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14215 CUMULATIVE_ARGS local_cum;
14216 int gr_saved = cfun->va_list_gpr_size;
14217 int vr_saved = cfun->va_list_fpr_size;
14218
14219 /* The caller has advanced CUM up to, but not beyond, the last named
14220 argument. Advance a local copy of CUM past the last "real" named
14221 argument, to find out how many registers are left over. */
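/* As an illustrative example (values assumed, not taken from the sources):
   for a function declared as `int f (int x, ...)', the named X consumes only
   X0, so the advanced LOCAL_CUM records aapcs_ncrn == 1 and aapcs_nvrn == 0,
   leaving X1-X7 and Q0-Q7 as the registers that may need to be dumped to the
   register save area below. */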
14222 local_cum = *cum;
14223 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14224
14225 /* Find out how many registers we need to save.
14226 Honor tree-stdarg analysis results. */
14227 if (cfun->va_list_gpr_size)
14228 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14229 cfun->va_list_gpr_size / UNITS_PER_WORD);
14230 if (cfun->va_list_fpr_size)
14231 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14232 cfun->va_list_fpr_size / UNITS_PER_VREG);
14233
14234 if (!TARGET_FLOAT)
14235 {
14236 gcc_assert (local_cum.aapcs_nvrn == 0);
14237 vr_saved = 0;
14238 }
14239
14240 if (!no_rtl)
14241 {
14242 if (gr_saved > 0)
14243 {
14244 rtx ptr, mem;
14245
14246 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14247 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14248 - gr_saved * UNITS_PER_WORD);
14249 mem = gen_frame_mem (BLKmode, ptr);
14250 set_mem_alias_set (mem, get_varargs_alias_set ());
14251
14252 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14253 mem, gr_saved);
14254 }
14255 if (vr_saved > 0)
14256 {
14257 /* We can't use move_block_from_reg, because it will use
14258 the wrong mode, storing D regs only. */
14259 machine_mode mode = TImode;
14260 int off, i, vr_start;
14261
14262 /* Set OFF to the offset from virtual_incoming_args_rtx of
14263 the first vector register. The VR save area lies below
14264 the GR one, and is aligned to 16 bytes. */
14265 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14266 STACK_BOUNDARY / BITS_PER_UNIT);
14267 off -= vr_saved * UNITS_PER_VREG;
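/* Worked example (illustrative values): with gr_saved == 3 and vr_saved == 2,
   and the usual 8-byte GPRs and 16-byte vector registers, OFF becomes
       -ROUND_UP (3 * 8, 16) - 2 * 16 = -32 - 32 = -64,
   i.e. the first saved vector register sits 64 bytes below
   virtual_incoming_args_rtx. */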
14268
14269 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14270 for (i = 0; i < vr_saved; ++i)
14271 {
14272 rtx ptr, mem;
14273
14274 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14275 mem = gen_frame_mem (mode, ptr);
14276 set_mem_alias_set (mem, get_varargs_alias_set ());
14277 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14278 off += UNITS_PER_VREG;
14279 }
14280 }
14281 }
14282
14283 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14284 any complication of having crtl->args.pretend_args_size changed. */
14285 cfun->machine->frame.saved_varargs_size
14286 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14287 STACK_BOUNDARY / BITS_PER_UNIT)
14288 + vr_saved * UNITS_PER_VREG);
14289 }
14290
14291 static void
14292 aarch64_conditional_register_usage (void)
14293 {
14294 int i;
14295 if (!TARGET_FLOAT)
14296 {
14297 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14298 {
14299 fixed_regs[i] = 1;
14300 call_used_regs[i] = 1;
14301 }
14302 }
14303 if (!TARGET_SVE)
14304 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14305 {
14306 fixed_regs[i] = 1;
14307 call_used_regs[i] = 1;
14308 }
14309
14310 /* When tracking speculation, we need a couple of call-clobbered registers
14311 to track the speculation state. It would be nice to just use
14312 IP0 and IP1, but currently there are numerous places that just
14313 assume these registers are free for other uses (eg pointer
14314 authentication). */
14315 if (aarch64_track_speculation)
14316 {
14317 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14318 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14319 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14320 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14321 }
14322 }
14323
14324 /* Walk down the type tree of TYPE counting consecutive base elements.
14325 If *MODEP is VOIDmode, then set it to the first valid floating point
14326 type. If a non-floating point type is found, or if a floating point
14327 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14328 otherwise return the count in the sub-tree. */
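/* Illustrative examples (assumed, not part of the original comment): for
   struct { float x, y, z; } the three SFmode fields give a count of 3 with
   *MODEP == SFmode, so the struct is a candidate homogeneous aggregate; for
   struct { float f; double d; } the DFmode field fails to match the SFmode
   already recorded in *MODEP, so the function returns -1. */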
14329 static int
14330 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14331 {
14332 machine_mode mode;
14333 HOST_WIDE_INT size;
14334
14335 switch (TREE_CODE (type))
14336 {
14337 case REAL_TYPE:
14338 mode = TYPE_MODE (type);
14339 if (mode != DFmode && mode != SFmode
14340 && mode != TFmode && mode != HFmode)
14341 return -1;
14342
14343 if (*modep == VOIDmode)
14344 *modep = mode;
14345
14346 if (*modep == mode)
14347 return 1;
14348
14349 break;
14350
14351 case COMPLEX_TYPE:
14352 mode = TYPE_MODE (TREE_TYPE (type));
14353 if (mode != DFmode && mode != SFmode
14354 && mode != TFmode && mode != HFmode)
14355 return -1;
14356
14357 if (*modep == VOIDmode)
14358 *modep = mode;
14359
14360 if (*modep == mode)
14361 return 2;
14362
14363 break;
14364
14365 case VECTOR_TYPE:
14366 /* Use V2SImode and V4SImode as representatives of all 64-bit
14367 and 128-bit vector types. */
14368 size = int_size_in_bytes (type);
14369 switch (size)
14370 {
14371 case 8:
14372 mode = V2SImode;
14373 break;
14374 case 16:
14375 mode = V4SImode;
14376 break;
14377 default:
14378 return -1;
14379 }
14380
14381 if (*modep == VOIDmode)
14382 *modep = mode;
14383
14384 /* Vector modes are considered to be opaque: two vectors are
14385 equivalent for the purposes of being homogeneous aggregates
14386 if they are the same size. */
14387 if (*modep == mode)
14388 return 1;
14389
14390 break;
14391
14392 case ARRAY_TYPE:
14393 {
14394 int count;
14395 tree index = TYPE_DOMAIN (type);
14396
14397 /* Can't handle incomplete types nor sizes that are not
14398 fixed. */
14399 if (!COMPLETE_TYPE_P (type)
14400 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14401 return -1;
14402
14403 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14404 if (count == -1
14405 || !index
14406 || !TYPE_MAX_VALUE (index)
14407 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14408 || !TYPE_MIN_VALUE (index)
14409 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14410 || count < 0)
14411 return -1;
14412
14413 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14414 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14415
14416 /* There must be no padding. */
14417 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14418 count * GET_MODE_BITSIZE (*modep)))
14419 return -1;
14420
14421 return count;
14422 }
14423
14424 case RECORD_TYPE:
14425 {
14426 int count = 0;
14427 int sub_count;
14428 tree field;
14429
14430 /* Can't handle incomplete types nor sizes that are not
14431 fixed. */
14432 if (!COMPLETE_TYPE_P (type)
14433 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14434 return -1;
14435
14436 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14437 {
14438 if (TREE_CODE (field) != FIELD_DECL)
14439 continue;
14440
14441 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14442 if (sub_count < 0)
14443 return -1;
14444 count += sub_count;
14445 }
14446
14447 /* There must be no padding. */
14448 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14449 count * GET_MODE_BITSIZE (*modep)))
14450 return -1;
14451
14452 return count;
14453 }
14454
14455 case UNION_TYPE:
14456 case QUAL_UNION_TYPE:
14457 {
14458 /* These aren't very interesting except in a degenerate case. */
14459 int count = 0;
14460 int sub_count;
14461 tree field;
14462
14463 /* Can't handle incomplete types nor sizes that are not
14464 fixed. */
14465 if (!COMPLETE_TYPE_P (type)
14466 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14467 return -1;
14468
14469 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14470 {
14471 if (TREE_CODE (field) != FIELD_DECL)
14472 continue;
14473
14474 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14475 if (sub_count < 0)
14476 return -1;
14477 count = count > sub_count ? count : sub_count;
14478 }
14479
14480 /* There must be no padding. */
14481 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14482 count * GET_MODE_BITSIZE (*modep)))
14483 return -1;
14484
14485 return count;
14486 }
14487
14488 default:
14489 break;
14490 }
14491
14492 return -1;
14493 }
14494
14495 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14496 type as described in AAPCS64 \S 4.1.2.
14497
14498 See the comment above aarch64_composite_type_p for the notes on MODE. */
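/* For example (a sketch of typical cases): the Advanced SIMD types
   int32x2_t (8 bytes) and float32x4_t (16 bytes) are short vectors in this
   sense, whereas a 32-byte generic GCC vector is not. */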
14499
14500 static bool
14501 aarch64_short_vector_p (const_tree type,
14502 machine_mode mode)
14503 {
14504 poly_int64 size = -1;
14505
14506 if (type && TREE_CODE (type) == VECTOR_TYPE)
14507 size = int_size_in_bytes (type);
14508 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14509 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14510 size = GET_MODE_SIZE (mode);
14511
14512 return known_eq (size, 8) || known_eq (size, 16);
14513 }
14514
14515 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14516 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14517 array types. The C99 floating-point complex types are also considered
14518 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14519 types, which are GCC extensions and out of the scope of AAPCS64, are
14520 treated as composite types here as well.
14521
14522 Note that MODE itself is not sufficient in determining whether a type
14523 is such a composite type or not. This is because
14524 stor-layout.c:compute_record_mode may have already changed the MODE
14525 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14526 structure with only one field may have its MODE set to the mode of the
14527 field. Also an integer mode whose size matches the size of the
14528 RECORD_TYPE type may be used to substitute the original mode
14529 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14530 solely relied on. */
14531
14532 static bool
14533 aarch64_composite_type_p (const_tree type,
14534 machine_mode mode)
14535 {
14536 if (aarch64_short_vector_p (type, mode))
14537 return false;
14538
14539 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14540 return true;
14541
14542 if (mode == BLKmode
14543 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14544 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14545 return true;
14546
14547 return false;
14548 }
14549
14550 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14551 shall be passed or returned in simd/fp register(s) (providing these
14552 parameter passing registers are available).
14553
14554 Upon successful return, *COUNT returns the number of needed registers,
14555 *BASE_MODE returns the mode of the individual register and when IS_HA
14556 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14557 floating-point aggregate or a homogeneous short-vector aggregate. */
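/* Illustrative examples (assumed, not exhaustive): for
   struct { double x, y; } the function returns true with *COUNT == 2,
   *BASE_MODE == DFmode and *IS_HA set; for _Complex float it returns true
   with *COUNT == 2 and *BASE_MODE == SFmode via the MODE_COMPLEX_FLOAT case
   below. */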
14558
14559 static bool
14560 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14561 const_tree type,
14562 machine_mode *base_mode,
14563 int *count,
14564 bool *is_ha)
14565 {
14566 machine_mode new_mode = VOIDmode;
14567 bool composite_p = aarch64_composite_type_p (type, mode);
14568
14569 if (is_ha != NULL) *is_ha = false;
14570
14571 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14572 || aarch64_short_vector_p (type, mode))
14573 {
14574 *count = 1;
14575 new_mode = mode;
14576 }
14577 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14578 {
14579 if (is_ha != NULL) *is_ha = true;
14580 *count = 2;
14581 new_mode = GET_MODE_INNER (mode);
14582 }
14583 else if (type && composite_p)
14584 {
14585 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14586
14587 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14588 {
14589 if (is_ha != NULL) *is_ha = true;
14590 *count = ag_count;
14591 }
14592 else
14593 return false;
14594 }
14595 else
14596 return false;
14597
14598 *base_mode = new_mode;
14599 return true;
14600 }
14601
14602 /* Implement TARGET_STRUCT_VALUE_RTX. */
14603
14604 static rtx
14605 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14606 int incoming ATTRIBUTE_UNUSED)
14607 {
14608 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14609 }
14610
14611 /* Implements target hook vector_mode_supported_p. */
14612 static bool
14613 aarch64_vector_mode_supported_p (machine_mode mode)
14614 {
14615 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14616 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14617 }
14618
14619 /* Return the full-width SVE vector mode for element mode MODE, if one
14620 exists. */
14621 opt_machine_mode
14622 aarch64_full_sve_mode (scalar_mode mode)
14623 {
14624 switch (mode)
14625 {
14626 case E_DFmode:
14627 return VNx2DFmode;
14628 case E_SFmode:
14629 return VNx4SFmode;
14630 case E_HFmode:
14631 return VNx8HFmode;
14632 case E_DImode:
14633 return VNx2DImode;
14634 case E_SImode:
14635 return VNx4SImode;
14636 case E_HImode:
14637 return VNx8HImode;
14638 case E_QImode:
14639 return VNx16QImode;
14640 default:
14641 return opt_machine_mode ();
14642 }
14643 }
14644
14645 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14646 if it exists. */
14647 opt_machine_mode
14648 aarch64_vq_mode (scalar_mode mode)
14649 {
14650 switch (mode)
14651 {
14652 case E_DFmode:
14653 return V2DFmode;
14654 case E_SFmode:
14655 return V4SFmode;
14656 case E_HFmode:
14657 return V8HFmode;
14658 case E_SImode:
14659 return V4SImode;
14660 case E_HImode:
14661 return V8HImode;
14662 case E_QImode:
14663 return V16QImode;
14664 case E_DImode:
14665 return V2DImode;
14666 default:
14667 return opt_machine_mode ();
14668 }
14669 }
14670
14671 /* Return appropriate SIMD container
14672 for MODE within a vector of WIDTH bits. */
14673 static machine_mode
14674 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14675 {
14676 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14677 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14678
14679 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14680 if (TARGET_SIMD)
14681 {
14682 if (known_eq (width, 128))
14683 return aarch64_vq_mode (mode).else_mode (word_mode);
14684 else
14685 switch (mode)
14686 {
14687 case E_SFmode:
14688 return V2SFmode;
14689 case E_HFmode:
14690 return V4HFmode;
14691 case E_SImode:
14692 return V2SImode;
14693 case E_HImode:
14694 return V4HImode;
14695 case E_QImode:
14696 return V8QImode;
14697 default:
14698 break;
14699 }
14700 }
14701 return word_mode;
14702 }
14703
14704 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14705 static machine_mode
14706 aarch64_preferred_simd_mode (scalar_mode mode)
14707 {
14708 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14709 return aarch64_simd_container_mode (mode, bits);
14710 }
14711
14712 /* Return a list of possible vector sizes for the vectorizer
14713 to iterate over. */
14714 static void
14715 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14716 {
14717 if (TARGET_SVE)
14718 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14719 sizes->safe_push (16);
14720 sizes->safe_push (8);
14721 }
14722
14723 /* Implement TARGET_MANGLE_TYPE. */
14724
14725 static const char *
14726 aarch64_mangle_type (const_tree type)
14727 {
14728 /* The AArch64 ABI documents say that "__va_list" has to be
14729 mangled as if it is in the "std" namespace. */
14730 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14731 return "St9__va_list";
14732
14733 /* Half-precision float. */
14734 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14735 return "Dh";
14736
14737 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14738 builtin types. */
14739 if (TYPE_NAME (type) != NULL)
14740 return aarch64_mangle_builtin_type (type);
14741
14742 /* Use the default mangling. */
14743 return NULL;
14744 }
14745
14746 /* Find the first rtx_insn before insn that will generate an assembly
14747 instruction. */
14748
14749 static rtx_insn *
14750 aarch64_prev_real_insn (rtx_insn *insn)
14751 {
14752 if (!insn)
14753 return NULL;
14754
14755 do
14756 {
14757 insn = prev_real_insn (insn);
14758 }
14759 while (insn && recog_memoized (insn) < 0);
14760
14761 return insn;
14762 }
14763
14764 static bool
14765 is_madd_op (enum attr_type t1)
14766 {
14767 unsigned int i;
14768 /* A number of these may be AArch32 only. */
14769 enum attr_type mlatypes[] = {
14770 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14771 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14772 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14773 };
14774
14775 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14776 {
14777 if (t1 == mlatypes[i])
14778 return true;
14779 }
14780
14781 return false;
14782 }
14783
14784 /* Check if there is a register dependency between a load and the insn
14785 for which we hold recog_data. */
14786
14787 static bool
14788 dep_between_memop_and_curr (rtx memop)
14789 {
14790 rtx load_reg;
14791 int opno;
14792
14793 gcc_assert (GET_CODE (memop) == SET);
14794
14795 if (!REG_P (SET_DEST (memop)))
14796 return false;
14797
14798 load_reg = SET_DEST (memop);
14799 for (opno = 1; opno < recog_data.n_operands; opno++)
14800 {
14801 rtx operand = recog_data.operand[opno];
14802 if (REG_P (operand)
14803 && reg_overlap_mentioned_p (load_reg, operand))
14804 return true;
14805
14806 }
14807 return false;
14808 }
14809
14810
14811 /* When working around the Cortex-A53 erratum 835769,
14812 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14813 instruction and has a preceding memory instruction such that a NOP
14814 should be inserted between them. */
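/* For example (an illustrative sketch of the affected sequence), code such as

       ldr  x0, [x2]
       madd x3, x4, x5, x6

   would get a NOP emitted between the load and the multiply-accumulate when
   TARGET_FIX_ERR_A53_835769 is enabled. */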
14815
14816 bool
14817 aarch64_madd_needs_nop (rtx_insn* insn)
14818 {
14819 enum attr_type attr_type;
14820 rtx_insn *prev;
14821 rtx body;
14822
14823 if (!TARGET_FIX_ERR_A53_835769)
14824 return false;
14825
14826 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14827 return false;
14828
14829 attr_type = get_attr_type (insn);
14830 if (!is_madd_op (attr_type))
14831 return false;
14832
14833 prev = aarch64_prev_real_insn (insn);
14834 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14835 Restore recog state to INSN to avoid state corruption. */
14836 extract_constrain_insn_cached (insn);
14837
14838 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14839 return false;
14840
14841 body = single_set (prev);
14842
14843 /* If the previous insn is a memory op and there is no dependency between
14844 it and the DImode madd, emit a NOP between them. If body is NULL then we
14845 have a complex memory operation, probably a load/store pair.
14846 Be conservative for now and emit a NOP. */
14847 if (GET_MODE (recog_data.operand[0]) == DImode
14848 && (!body || !dep_between_memop_and_curr (body)))
14849 return true;
14850
14851 return false;
14852
14853 }
14854
14855
14856 /* Implement FINAL_PRESCAN_INSN. */
14857
14858 void
14859 aarch64_final_prescan_insn (rtx_insn *insn)
14860 {
14861 if (aarch64_madd_needs_nop (insn))
14862 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14863 }
14864
14865
14866 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14867 instruction. */
14868
14869 bool
14870 aarch64_sve_index_immediate_p (rtx base_or_step)
14871 {
14872 return (CONST_INT_P (base_or_step)
14873 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14874 }
14875
14876 /* Return true if X is a valid immediate for the SVE ADD and SUB
14877 instructions. Negate X first if NEGATE_P is true. */
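/* The encoding accepts an unsigned 8-bit value, optionally shifted left by
   eight bits.  For example (illustrative values): 7 and 0x2300 are valid
   immediates, while 0x101 is not, because its low byte is nonzero and the
   value does not fit in eight bits. */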
14878
14879 bool
14880 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14881 {
14882 rtx elt;
14883
14884 if (!const_vec_duplicate_p (x, &elt)
14885 || !CONST_INT_P (elt))
14886 return false;
14887
14888 HOST_WIDE_INT val = INTVAL (elt);
14889 if (negate_p)
14890 val = -val;
14891 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14892
14893 if (val & 0xff)
14894 return IN_RANGE (val, 0, 0xff);
14895 return IN_RANGE (val, 0, 0xff00);
14896 }
14897
14898 /* Return true if X is a valid immediate operand for an SVE logical
14899 instruction such as AND. */
14900
14901 bool
14902 aarch64_sve_bitmask_immediate_p (rtx x)
14903 {
14904 rtx elt;
14905
14906 return (const_vec_duplicate_p (x, &elt)
14907 && CONST_INT_P (elt)
14908 && aarch64_bitmask_imm (INTVAL (elt),
14909 GET_MODE_INNER (GET_MODE (x))));
14910 }
14911
14912 /* Return true if X is a valid immediate for the SVE DUP and CPY
14913 instructions. */
14914
14915 bool
14916 aarch64_sve_dup_immediate_p (rtx x)
14917 {
14918 rtx elt;
14919
14920 if (!const_vec_duplicate_p (x, &elt)
14921 || !CONST_INT_P (elt))
14922 return false;
14923
14924 HOST_WIDE_INT val = INTVAL (elt);
14925 if (val & 0xff)
14926 return IN_RANGE (val, -0x80, 0x7f);
14927 return IN_RANGE (val, -0x8000, 0x7f00);
14928 }
14929
14930 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14931 SIGNED_P says whether the operand is signed rather than unsigned. */
14932
14933 bool
14934 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14935 {
14936 rtx elt;
14937
14938 return (const_vec_duplicate_p (x, &elt)
14939 && CONST_INT_P (elt)
14940 && (signed_p
14941 ? IN_RANGE (INTVAL (elt), -16, 15)
14942 : IN_RANGE (INTVAL (elt), 0, 127)));
14943 }
14944
14945 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14946 instruction. Negate X first if NEGATE_P is true. */
14947
14948 bool
14949 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14950 {
14951 rtx elt;
14952 REAL_VALUE_TYPE r;
14953
14954 if (!const_vec_duplicate_p (x, &elt)
14955 || GET_CODE (elt) != CONST_DOUBLE)
14956 return false;
14957
14958 r = *CONST_DOUBLE_REAL_VALUE (elt);
14959
14960 if (negate_p)
14961 r = real_value_negate (&r);
14962
14963 if (real_equal (&r, &dconst1))
14964 return true;
14965 if (real_equal (&r, &dconsthalf))
14966 return true;
14967 return false;
14968 }
14969
14970 /* Return true if X is a valid immediate operand for an SVE FMUL
14971 instruction. */
14972
14973 bool
14974 aarch64_sve_float_mul_immediate_p (rtx x)
14975 {
14976 rtx elt;
14977
14978 /* GCC will never generate a multiply with an immediate of 2, so there is no
14979 point testing for it (even though it is a valid constant). */
14980 return (const_vec_duplicate_p (x, &elt)
14981 && GET_CODE (elt) == CONST_DOUBLE
14982 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14983 }
14984
14985 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14986 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14987 is nonnull, use it to describe valid immediates. */
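/* Worked examples (illustrative values): VAL32 == 0x00ab0000 is matched by
   the 4-byte case below as the byte 0xab with LSL #16, while
   VAL32 == 0x0000abff is matched by the MSL case as the byte 0xab with
   MSL #8 (the bits below the shift are all ones). */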
14988 static bool
14989 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14990 simd_immediate_info *info,
14991 enum simd_immediate_check which,
14992 simd_immediate_info::insn_type insn)
14993 {
14994 /* Try a 4-byte immediate with LSL. */
14995 for (unsigned int shift = 0; shift < 32; shift += 8)
14996 if ((val32 & (0xff << shift)) == val32)
14997 {
14998 if (info)
14999 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15000 simd_immediate_info::LSL, shift);
15001 return true;
15002 }
15003
15004 /* Try a 2-byte immediate with LSL. */
15005 unsigned int imm16 = val32 & 0xffff;
15006 if (imm16 == (val32 >> 16))
15007 for (unsigned int shift = 0; shift < 16; shift += 8)
15008 if ((imm16 & (0xff << shift)) == imm16)
15009 {
15010 if (info)
15011 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15012 simd_immediate_info::LSL, shift);
15013 return true;
15014 }
15015
15016 /* Try a 4-byte immediate with MSL, except for cases that MVN
15017 can handle. */
15018 if (which == AARCH64_CHECK_MOV)
15019 for (unsigned int shift = 8; shift < 24; shift += 8)
15020 {
15021 unsigned int low = (1 << shift) - 1;
15022 if (((val32 & (0xff << shift)) | low) == val32)
15023 {
15024 if (info)
15025 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15026 simd_immediate_info::MSL, shift);
15027 return true;
15028 }
15029 }
15030
15031 return false;
15032 }
15033
15034 /* Return true if replicating VAL64 is a valid immediate for the
15035 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15036 use it to describe valid immediates. */
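/* As an illustrative example of the "bit-to-bytemask" case handled below:
   VAL64 == 0xff0000ff00ffff00 has every byte equal to 0x00 or 0xff, so it
   can be built with the 64-bit byte-mask immediate encoding even though it
   is not a replicated 32-bit value. */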
15037 static bool
15038 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15039 simd_immediate_info *info,
15040 enum simd_immediate_check which)
15041 {
15042 unsigned int val32 = val64 & 0xffffffff;
15043 unsigned int val16 = val64 & 0xffff;
15044 unsigned int val8 = val64 & 0xff;
15045
15046 if (val32 == (val64 >> 32))
15047 {
15048 if ((which & AARCH64_CHECK_ORR) != 0
15049 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15050 simd_immediate_info::MOV))
15051 return true;
15052
15053 if ((which & AARCH64_CHECK_BIC) != 0
15054 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15055 simd_immediate_info::MVN))
15056 return true;
15057
15058 /* Try using a replicated byte. */
15059 if (which == AARCH64_CHECK_MOV
15060 && val16 == (val32 >> 16)
15061 && val8 == (val16 >> 8))
15062 {
15063 if (info)
15064 *info = simd_immediate_info (QImode, val8);
15065 return true;
15066 }
15067 }
15068
15069 /* Try using a bit-to-bytemask. */
15070 if (which == AARCH64_CHECK_MOV)
15071 {
15072 unsigned int i;
15073 for (i = 0; i < 64; i += 8)
15074 {
15075 unsigned char byte = (val64 >> i) & 0xff;
15076 if (byte != 0 && byte != 0xff)
15077 break;
15078 }
15079 if (i == 64)
15080 {
15081 if (info)
15082 *info = simd_immediate_info (DImode, val64);
15083 return true;
15084 }
15085 }
15086 return false;
15087 }
15088
15089 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15090 instruction. If INFO is nonnull, use it to describe valid immediates. */
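/* Illustrative examples (assumed values): 0x0101010101010101 repeats at byte
   granularity and is loaded as a byte DUP of #1; 0x00ff00ff00ff00ff repeats
   at halfword granularity, is out of range for DUP, but should be matched by
   the DUPM (bitmask) case below. */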
15091
15092 static bool
15093 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15094 simd_immediate_info *info)
15095 {
15096 scalar_int_mode mode = DImode;
15097 unsigned int val32 = val64 & 0xffffffff;
15098 if (val32 == (val64 >> 32))
15099 {
15100 mode = SImode;
15101 unsigned int val16 = val32 & 0xffff;
15102 if (val16 == (val32 >> 16))
15103 {
15104 mode = HImode;
15105 unsigned int val8 = val16 & 0xff;
15106 if (val8 == (val16 >> 8))
15107 mode = QImode;
15108 }
15109 }
15110 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15111 if (IN_RANGE (val, -0x80, 0x7f))
15112 {
15113 /* DUP with no shift. */
15114 if (info)
15115 *info = simd_immediate_info (mode, val);
15116 return true;
15117 }
15118 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15119 {
15120 /* DUP with LSL #8. */
15121 if (info)
15122 *info = simd_immediate_info (mode, val);
15123 return true;
15124 }
15125 if (aarch64_bitmask_imm (val64, mode))
15126 {
15127 /* DUPM. */
15128 if (info)
15129 *info = simd_immediate_info (mode, val);
15130 return true;
15131 }
15132 return false;
15133 }
15134
15135 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15136 it to describe valid immediates. */
15137
15138 static bool
15139 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15140 {
15141 if (x == CONST0_RTX (GET_MODE (x)))
15142 {
15143 if (info)
15144 *info = simd_immediate_info (DImode, 0);
15145 return true;
15146 }
15147
15148 /* Analyze the value as a VNx16BImode. This should be relatively
15149 efficient, since rtx_vector_builder has enough built-in capacity
15150 to store all VLA predicate constants without needing the heap. */
15151 rtx_vector_builder builder;
15152 if (!aarch64_get_sve_pred_bits (builder, x))
15153 return false;
15154
15155 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15156 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15157 {
15158 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15159 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15160 if (pattern != AARCH64_NUM_SVPATTERNS)
15161 {
15162 if (info)
15163 {
15164 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15165 *info = simd_immediate_info (int_mode, pattern);
15166 }
15167 return true;
15168 }
15169 }
15170 return false;
15171 }
15172
15173 /* Return true if OP is a valid SIMD immediate for the operation
15174 described by WHICH. If INFO is nonnull, use it to describe valid
15175 immediates. */
15176 bool
15177 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15178 enum simd_immediate_check which)
15179 {
15180 machine_mode mode = GET_MODE (op);
15181 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15182 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15183 return false;
15184
15185 if (vec_flags & VEC_SVE_PRED)
15186 return aarch64_sve_pred_valid_immediate (op, info);
15187
15188 scalar_mode elt_mode = GET_MODE_INNER (mode);
15189 rtx base, step;
15190 unsigned int n_elts;
15191 if (GET_CODE (op) == CONST_VECTOR
15192 && CONST_VECTOR_DUPLICATE_P (op))
15193 n_elts = CONST_VECTOR_NPATTERNS (op);
15194 else if ((vec_flags & VEC_SVE_DATA)
15195 && const_vec_series_p (op, &base, &step))
15196 {
15197 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15198 if (!aarch64_sve_index_immediate_p (base)
15199 || !aarch64_sve_index_immediate_p (step))
15200 return false;
15201
15202 if (info)
15203 *info = simd_immediate_info (elt_mode, base, step);
15204 return true;
15205 }
15206 else if (GET_CODE (op) == CONST_VECTOR
15207 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15208 /* N_ELTS set above. */;
15209 else
15210 return false;
15211
15212 scalar_float_mode elt_float_mode;
15213 if (n_elts == 1
15214 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15215 {
15216 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15217 if (aarch64_float_const_zero_rtx_p (elt)
15218 || aarch64_float_const_representable_p (elt))
15219 {
15220 if (info)
15221 *info = simd_immediate_info (elt_float_mode, elt);
15222 return true;
15223 }
15224 }
15225
15226 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15227 if (elt_size > 8)
15228 return false;
15229
15230 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15231
15232 /* Expand the vector constant out into a byte vector, with the least
15233 significant byte of the register first. */
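/* For example (little-endian, values assumed): the V4HImode constant
   { 1, 2, 3, 4 } expands to the byte vector
   { 01, 00, 02, 00, 03, 00, 04, 00 }, which repeats within eight bytes and
   packs into VAL64 == 0x0004000300020001 below. */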
15234 auto_vec<unsigned char, 16> bytes;
15235 bytes.reserve (n_elts * elt_size);
15236 for (unsigned int i = 0; i < n_elts; i++)
15237 {
15238 /* The vector is provided in gcc endian-neutral fashion.
15239 For aarch64_be Advanced SIMD, it must be laid out in the vector
15240 register in reverse order. */
15241 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15242 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15243
15244 if (elt_mode != elt_int_mode)
15245 elt = gen_lowpart (elt_int_mode, elt);
15246
15247 if (!CONST_INT_P (elt))
15248 return false;
15249
15250 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15251 for (unsigned int byte = 0; byte < elt_size; byte++)
15252 {
15253 bytes.quick_push (elt_val & 0xff);
15254 elt_val >>= BITS_PER_UNIT;
15255 }
15256 }
15257
15258 /* The immediate must repeat every eight bytes. */
15259 unsigned int nbytes = bytes.length ();
15260 for (unsigned i = 8; i < nbytes; ++i)
15261 if (bytes[i] != bytes[i - 8])
15262 return false;
15263
15264 /* Get the repeating 8-byte value as an integer. No endian correction
15265 is needed here because bytes is already in lsb-first order. */
15266 unsigned HOST_WIDE_INT val64 = 0;
15267 for (unsigned int i = 0; i < 8; i++)
15268 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15269 << (i * BITS_PER_UNIT));
15270
15271 if (vec_flags & VEC_SVE_DATA)
15272 return aarch64_sve_valid_immediate (val64, info);
15273 else
15274 return aarch64_advsimd_valid_immediate (val64, info, which);
15275 }
15276
15277 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15278 has a step in the range of INDEX. Return the index expression if so,
15279 otherwise return null. */
15280 rtx
15281 aarch64_check_zero_based_sve_index_immediate (rtx x)
15282 {
15283 rtx base, step;
15284 if (const_vec_series_p (x, &base, &step)
15285 && base == const0_rtx
15286 && aarch64_sve_index_immediate_p (step))
15287 return step;
15288 return NULL_RTX;
15289 }
15290
15291 /* Check if immediate shift constants are within range. */
15292 bool
15293 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15294 {
15295 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15296 if (left)
15297 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15298 else
15299 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15300 }
15301
15302 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15303 operation of width WIDTH at bit position POS. */
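/* Worked example (illustrative values): for WIDTH == 8 and POS == 16 the
   result is ((1 << 8) - 1) << 16 == 0x00ff0000, i.e. a mask selecting bits
   16 to 23. */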
15304
15305 rtx
15306 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15307 {
15308 gcc_assert (CONST_INT_P (width));
15309 gcc_assert (CONST_INT_P (pos));
15310
15311 unsigned HOST_WIDE_INT mask
15312 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15313 return GEN_INT (mask << UINTVAL (pos));
15314 }
15315
15316 bool
15317 aarch64_mov_operand_p (rtx x, machine_mode mode)
15318 {
15319 if (GET_CODE (x) == HIGH
15320 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15321 return true;
15322
15323 if (CONST_INT_P (x))
15324 return true;
15325
15326 if (VECTOR_MODE_P (GET_MODE (x)))
15327 {
15328 /* Require predicate constants to be VNx16BI before RA, so that we
15329 force everything to have a canonical form. */
15330 if (!lra_in_progress
15331 && !reload_completed
15332 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15333 && GET_MODE (x) != VNx16BImode)
15334 return false;
15335
15336 return aarch64_simd_valid_immediate (x, NULL);
15337 }
15338
15339 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15340 return true;
15341
15342 if (aarch64_sve_cnt_immediate_p (x))
15343 return true;
15344
15345 return aarch64_classify_symbolic_expression (x)
15346 == SYMBOL_TINY_ABSOLUTE;
15347 }
15348
15349 /* Return a const_int vector of VAL. */
15350 rtx
15351 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15352 {
15353 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15354 return gen_const_vec_duplicate (mode, c);
15355 }
15356
15357 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15358
15359 bool
15360 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15361 {
15362 machine_mode vmode;
15363
15364 vmode = aarch64_simd_container_mode (mode, 64);
15365 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15366 return aarch64_simd_valid_immediate (op_v, NULL);
15367 }
15368
15369 /* Construct and return a PARALLEL RTX vector with elements numbering the
15370 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15371 the vector - from the perspective of the architecture. This does not
15372 line up with GCC's perspective on lane numbers, so we end up with
15373 different masks depending on our target endian-ness. The diagram
15374 below may help. We must draw the distinction when building masks
15375 which select one half of the vector. An instruction selecting
15376 architectural low-lanes for a big-endian target, must be described using
15377 a mask selecting GCC high-lanes.
15378
15379 Big-Endian Little-Endian
15380
15381 GCC 0 1 2 3 3 2 1 0
15382 | x | x | x | x | | x | x | x | x |
15383 Architecture 3 2 1 0 3 2 1 0
15384
15385 Low Mask: { 2, 3 } { 0, 1 }
15386 High Mask: { 0, 1 } { 2, 3 }
15387
15388 MODE is the mode of the vector and NUNITS is the number of units in it. */
15389
15390 rtx
15391 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15392 {
15393 rtvec v = rtvec_alloc (nunits / 2);
15394 int high_base = nunits / 2;
15395 int low_base = 0;
15396 int base;
15397 rtx t1;
15398 int i;
15399
15400 if (BYTES_BIG_ENDIAN)
15401 base = high ? low_base : high_base;
15402 else
15403 base = high ? high_base : low_base;
15404
15405 for (i = 0; i < nunits / 2; i++)
15406 RTVEC_ELT (v, i) = GEN_INT (base + i);
15407
15408 t1 = gen_rtx_PARALLEL (mode, v);
15409 return t1;
15410 }
15411
15412 /* Check OP for validity as a PARALLEL RTX vector with elements
15413 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15414 from the perspective of the architecture. See the diagram above
15415 aarch64_simd_vect_par_cnst_half for more details. */
15416
15417 bool
15418 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15419 bool high)
15420 {
15421 int nelts;
15422 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15423 return false;
15424
15425 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15426 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15427 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15428 int i = 0;
15429
15430 if (count_op != count_ideal)
15431 return false;
15432
15433 for (i = 0; i < count_ideal; i++)
15434 {
15435 rtx elt_op = XVECEXP (op, 0, i);
15436 rtx elt_ideal = XVECEXP (ideal, 0, i);
15437
15438 if (!CONST_INT_P (elt_op)
15439 || INTVAL (elt_ideal) != INTVAL (elt_op))
15440 return false;
15441 }
15442 return true;
15443 }
15444
15445 /* Return a PARALLEL containing NELTS elements, with element I equal
15446 to BASE + I * STEP. */
15447
15448 rtx
15449 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15450 {
15451 rtvec vec = rtvec_alloc (nelts);
15452 for (unsigned int i = 0; i < nelts; ++i)
15453 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15454 return gen_rtx_PARALLEL (VOIDmode, vec);
15455 }
15456
15457 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15458 series with step STEP. */
15459
15460 bool
15461 aarch64_stepped_int_parallel_p (rtx op, int step)
15462 {
15463 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15464 return false;
15465
15466 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15467 for (int i = 1; i < XVECLEN (op, 0); ++i)
15468 if (!CONST_INT_P (XVECEXP (op, 0, i))
15469 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15470 return false;
15471
15472 return true;
15473 }
15474
15475 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15476 HIGH (exclusive). */
15477 void
15478 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15479 const_tree exp)
15480 {
15481 HOST_WIDE_INT lane;
15482 gcc_assert (CONST_INT_P (operand));
15483 lane = INTVAL (operand);
15484
15485 if (lane < low || lane >= high)
15486 {
15487 if (exp)
15488 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15489 else
15490 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15491 }
15492 }
15493
15494 /* Perform endian correction on lane number N, which indexes a vector
15495 of mode MODE, and return the result as an SImode rtx. */
15496
15497 rtx
15498 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15499 {
15500 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15501 }
15502
15503 /* Return TRUE if OP is a valid vector addressing mode. */
15504
15505 bool
15506 aarch64_simd_mem_operand_p (rtx op)
15507 {
15508 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15509 || REG_P (XEXP (op, 0)));
15510 }
15511
15512 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15513
15514 bool
15515 aarch64_sve_ld1r_operand_p (rtx op)
15516 {
15517 struct aarch64_address_info addr;
15518 scalar_mode mode;
15519
15520 return (MEM_P (op)
15521 && is_a <scalar_mode> (GET_MODE (op), &mode)
15522 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15523 && addr.type == ADDRESS_REG_IMM
15524 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15525 }
15526
15527 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15528 bool
15529 aarch64_sve_ld1rq_operand_p (rtx op)
15530 {
15531 struct aarch64_address_info addr;
15532 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15533 if (!MEM_P (op)
15534 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15535 return false;
15536
15537 if (addr.type == ADDRESS_REG_IMM)
15538 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15539
15540 if (addr.type == ADDRESS_REG_REG)
15541 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15542
15543 return false;
15544 }
15545
15546 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15547 The conditions for STR are the same. */
15548 bool
15549 aarch64_sve_ldr_operand_p (rtx op)
15550 {
15551 struct aarch64_address_info addr;
15552
15553 return (MEM_P (op)
15554 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15555 false, ADDR_QUERY_ANY)
15556 && addr.type == ADDRESS_REG_IMM);
15557 }
15558
15559 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15560 We need to be able to access the individual pieces, so the range
15561 is different from LD[234] and ST[234]. */
15562 bool
15563 aarch64_sve_struct_memory_operand_p (rtx op)
15564 {
15565 if (!MEM_P (op))
15566 return false;
15567
15568 machine_mode mode = GET_MODE (op);
15569 struct aarch64_address_info addr;
15570 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15571 ADDR_QUERY_ANY)
15572 || addr.type != ADDRESS_REG_IMM)
15573 return false;
15574
15575 poly_int64 first = addr.const_offset;
15576 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15577 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15578 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15579 }
15580
15581 /* Emit a register copy from operand to operand, taking care not to
15582 early-clobber source registers in the process.
15583
15584 COUNT is the number of components into which the copy needs to be
15585 decomposed. */
15586 void
15587 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15588 unsigned int count)
15589 {
15590 unsigned int i;
15591 int rdest = REGNO (operands[0]);
15592 int rsrc = REGNO (operands[1]);
15593
15594 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15595 || rdest < rsrc)
15596 for (i = 0; i < count; i++)
15597 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15598 gen_rtx_REG (mode, rsrc + i));
15599 else
15600 for (i = 0; i < count; i++)
15601 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15602 gen_rtx_REG (mode, rsrc + count - i - 1));
15603 }
15604
15605 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15606 one of VSTRUCT modes: OI, CI, or XI. */
15607 int
15608 aarch64_simd_attr_length_rglist (machine_mode mode)
15609 {
15610 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15611 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15612 }
15613
15614 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15615 alignment of a vector to 128 bits. SVE predicates have an alignment of
15616 16 bits. */
15617 static HOST_WIDE_INT
15618 aarch64_simd_vector_alignment (const_tree type)
15619 {
15620 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15621 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15622 be set for non-predicate vectors of booleans. Modes are the most
15623 direct way we have of identifying real SVE predicate types. */
15624 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15625 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15626 }
15627
15628 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15629 static poly_uint64
15630 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15631 {
15632 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15633 {
15634 /* If the length of the vector is fixed, try to align to that length,
15635 otherwise don't try to align at all. */
15636 HOST_WIDE_INT result;
15637 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15638 result = TYPE_ALIGN (TREE_TYPE (type));
15639 return result;
15640 }
15641 return TYPE_ALIGN (type);
15642 }
15643
15644 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15645 static bool
15646 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15647 {
15648 if (is_packed)
15649 return false;
15650
15651 /* For fixed-length vectors, check that the vectorizer will aim for
15652 full-vector alignment. This isn't true for generic GCC vectors
15653 that are wider than the ABI maximum of 128 bits. */
15654 poly_uint64 preferred_alignment =
15655 aarch64_vectorize_preferred_vector_alignment (type);
15656 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15657 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15658 preferred_alignment))
15659 return false;
15660
15661 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15662 return true;
15663 }
15664
15665 /* Return true if the vector misalignment factor is supported by the
15666 target. */
15667 static bool
15668 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15669 const_tree type, int misalignment,
15670 bool is_packed)
15671 {
15672 if (TARGET_SIMD && STRICT_ALIGNMENT)
15673 {
15674 /* Return false if the movmisalign pattern is not supported for this mode. */
15675 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15676 return false;
15677
15678 /* Misalignment factor is unknown at compile time. */
15679 if (misalignment == -1)
15680 return false;
15681 }
15682 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15683 is_packed);
15684 }
15685
15686 /* If VALS is a vector constant that can be loaded into a register
15687 using DUP, generate instructions to do so and return an RTX to
15688 assign to the register. Otherwise return NULL_RTX. */
15689 static rtx
15690 aarch64_simd_dup_constant (rtx vals)
15691 {
15692 machine_mode mode = GET_MODE (vals);
15693 machine_mode inner_mode = GET_MODE_INNER (mode);
15694 rtx x;
15695
15696 if (!const_vec_duplicate_p (vals, &x))
15697 return NULL_RTX;
15698
15699 /* We can load this constant by using DUP and a constant in a
15700 single ARM register. This will be cheaper than a vector
15701 load. */
15702 x = copy_to_mode_reg (inner_mode, x);
15703 return gen_vec_duplicate (mode, x);
15704 }
15705
15706
15707 /* Generate code to load VALS, which is a PARALLEL containing only
15708 constants (for vec_init) or CONST_VECTOR, efficiently into a
15709 register. Returns an RTX to copy into the register, or NULL_RTX
15710 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15711 static rtx
15712 aarch64_simd_make_constant (rtx vals)
15713 {
15714 machine_mode mode = GET_MODE (vals);
15715 rtx const_dup;
15716 rtx const_vec = NULL_RTX;
15717 int n_const = 0;
15718 int i;
15719
15720 if (GET_CODE (vals) == CONST_VECTOR)
15721 const_vec = vals;
15722 else if (GET_CODE (vals) == PARALLEL)
15723 {
15724 /* A CONST_VECTOR must contain only CONST_INTs and
15725 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15726 Only store valid constants in a CONST_VECTOR. */
15727 int n_elts = XVECLEN (vals, 0);
15728 for (i = 0; i < n_elts; ++i)
15729 {
15730 rtx x = XVECEXP (vals, 0, i);
15731 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15732 n_const++;
15733 }
15734 if (n_const == n_elts)
15735 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15736 }
15737 else
15738 gcc_unreachable ();
15739
15740 if (const_vec != NULL_RTX
15741 && aarch64_simd_valid_immediate (const_vec, NULL))
15742 /* Load using MOVI/MVNI. */
15743 return const_vec;
15744 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15745 /* Loaded using DUP. */
15746 return const_dup;
15747 else if (const_vec != NULL_RTX)
15748 /* Load from constant pool. We cannot take advantage of single-cycle
15749 LD1 because we need a PC-relative addressing mode. */
15750 return const_vec;
15751 else
15752 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15753 We cannot construct an initializer. */
15754 return NULL_RTX;
15755 }
15756
15757 /* Expand a vector initialisation sequence, such that TARGET is
15758 initialised to contain VALS. */
15759
15760 void
15761 aarch64_expand_vector_init (rtx target, rtx vals)
15762 {
15763 machine_mode mode = GET_MODE (target);
15764 scalar_mode inner_mode = GET_MODE_INNER (mode);
15765 /* The number of vector elements. */
15766 int n_elts = XVECLEN (vals, 0);
15767 /* The number of vector elements which are not constant. */
15768 int n_var = 0;
15769 rtx any_const = NULL_RTX;
15770 /* The first element of vals. */
15771 rtx v0 = XVECEXP (vals, 0, 0);
15772 bool all_same = true;
15773
15774 /* This is a special vec_init<M><N> where N is not an element mode but a
15775 vector mode with half the elements of M. We expect to find two entries
15776 of mode N in VALS and we must put their concatenation into TARGET. */
15777 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15778 {
15779 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15780 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15781 rtx lo = XVECEXP (vals, 0, 0);
15782 rtx hi = XVECEXP (vals, 0, 1);
15783 machine_mode narrow_mode = GET_MODE (lo);
15784 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15785 gcc_assert (narrow_mode == GET_MODE (hi));
15786
15787 /* When we want to concatenate a half-width vector with zeroes we can
15788 use the aarch64_combinez[_be] patterns. Just make sure that the
15789 zeroes are in the right half. */
15790 if (BYTES_BIG_ENDIAN
15791 && aarch64_simd_imm_zero (lo, narrow_mode)
15792 && general_operand (hi, narrow_mode))
15793 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15794 else if (!BYTES_BIG_ENDIAN
15795 && aarch64_simd_imm_zero (hi, narrow_mode)
15796 && general_operand (lo, narrow_mode))
15797 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15798 else
15799 {
15800 /* Else create the two half-width registers and combine them. */
15801 if (!REG_P (lo))
15802 lo = force_reg (GET_MODE (lo), lo);
15803 if (!REG_P (hi))
15804 hi = force_reg (GET_MODE (hi), hi);
15805
15806 if (BYTES_BIG_ENDIAN)
15807 std::swap (lo, hi);
15808 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15809 }
15810 return;
15811 }
15812
15813 /* Count the number of variable elements to initialise. */
15814 for (int i = 0; i < n_elts; ++i)
15815 {
15816 rtx x = XVECEXP (vals, 0, i);
15817 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15818 ++n_var;
15819 else
15820 any_const = x;
15821
15822 all_same &= rtx_equal_p (x, v0);
15823 }
15824
15825 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15826 how best to handle this. */
15827 if (n_var == 0)
15828 {
15829 rtx constant = aarch64_simd_make_constant (vals);
15830 if (constant != NULL_RTX)
15831 {
15832 emit_move_insn (target, constant);
15833 return;
15834 }
15835 }
15836
15837 /* Splat a single non-constant element if we can. */
15838 if (all_same)
15839 {
15840 rtx x = copy_to_mode_reg (inner_mode, v0);
15841 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15842 return;
15843 }
15844
15845 enum insn_code icode = optab_handler (vec_set_optab, mode);
15846 gcc_assert (icode != CODE_FOR_nothing);
15847
15848 /* If there are only variable elements, try to optimize
15849 the insertion using dup for the most common element
15850 followed by insertions. */
15851
15852 /* The algorithm will fill matches[*][0] with the earliest matching element,
15853 and matches[X][1] with the count of duplicate elements (if X is the
15854 earliest element which has duplicates). */
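/* Worked example (element names assumed): for VALS == { a, b, a, a } the loop
   below produces matches[0] == { 0, 3 }, matches[1] == { 1, 1 } and
   matches[2] == matches[3] == { 0, 0 }, so element 0 is duplicated across the
   vector first and only lane 1 needs a separate insertion. */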
15855
15856 if (n_var == n_elts && n_elts <= 16)
15857 {
15858 int matches[16][2] = {0};
15859 for (int i = 0; i < n_elts; i++)
15860 {
15861 for (int j = 0; j <= i; j++)
15862 {
15863 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15864 {
15865 matches[i][0] = j;
15866 matches[j][1]++;
15867 break;
15868 }
15869 }
15870 }
15871 int maxelement = 0;
15872 int maxv = 0;
15873 for (int i = 0; i < n_elts; i++)
15874 if (matches[i][1] > maxv)
15875 {
15876 maxelement = i;
15877 maxv = matches[i][1];
15878 }
15879
15880 /* Create a duplicate of the most common element, unless all elements
15881 are equally useless to us, in which case just immediately set the
15882 vector register using the first element. */
15883
15884 if (maxv == 1)
15885 {
15886 /* For vectors of two 64-bit elements, we can do even better. */
15887 if (n_elts == 2
15888 && (inner_mode == E_DImode
15889 || inner_mode == E_DFmode))
15890
15891 {
15892 rtx x0 = XVECEXP (vals, 0, 0);
15893 rtx x1 = XVECEXP (vals, 0, 1);
15894 /* Combine can pick up this case, but handling it directly
15895 here leaves clearer RTL.
15896
15897 This is load_pair_lanes<mode>, and also gives us a clean-up
15898 for store_pair_lanes<mode>. */
15899 if (memory_operand (x0, inner_mode)
15900 && memory_operand (x1, inner_mode)
15901 && !STRICT_ALIGNMENT
15902 && rtx_equal_p (XEXP (x1, 0),
15903 plus_constant (Pmode,
15904 XEXP (x0, 0),
15905 GET_MODE_SIZE (inner_mode))))
15906 {
15907 rtx t;
15908 if (inner_mode == DFmode)
15909 t = gen_load_pair_lanesdf (target, x0, x1);
15910 else
15911 t = gen_load_pair_lanesdi (target, x0, x1);
15912 emit_insn (t);
15913 return;
15914 }
15915 }
15916 /* The subreg-move sequence below will move into lane zero of the
15917 vector register. For big-endian we want that position to hold
15918 the last element of VALS. */
15919 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15920 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15921 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15922 }
15923 else
15924 {
15925 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15926 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15927 }
15928
15929 /* Insert the rest. */
15930 for (int i = 0; i < n_elts; i++)
15931 {
15932 rtx x = XVECEXP (vals, 0, i);
15933 if (matches[i][0] == maxelement)
15934 continue;
15935 x = copy_to_mode_reg (inner_mode, x);
15936 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15937 }
15938 return;
15939 }
15940
15941 /* Initialise a vector which is part-variable. We want to first try
15942 to build those lanes which are constant in the most efficient way we
15943 can. */
15944 if (n_var != n_elts)
15945 {
15946 rtx copy = copy_rtx (vals);
15947
15948 /* Load constant part of vector. We really don't care what goes into the
15949 parts we will overwrite, but we're more likely to be able to load the
15950 constant efficiently if it has fewer, larger, repeating parts
15951 (see aarch64_simd_valid_immediate). */
15952 for (int i = 0; i < n_elts; i++)
15953 {
15954 rtx x = XVECEXP (vals, 0, i);
15955 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15956 continue;
15957 rtx subst = any_const;
15958 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15959 {
15960 /* Look in the copied vector, as more elements are const. */
15961 rtx test = XVECEXP (copy, 0, i ^ bit);
15962 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15963 {
15964 subst = test;
15965 break;
15966 }
15967 }
15968 XVECEXP (copy, 0, i) = subst;
15969 }
15970 aarch64_expand_vector_init (target, copy);
15971 }
15972
15973 /* Insert the variable lanes directly. */
15974 for (int i = 0; i < n_elts; i++)
15975 {
15976 rtx x = XVECEXP (vals, 0, i);
15977 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15978 continue;
15979 x = copy_to_mode_reg (inner_mode, x);
15980 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15981 }
15982 }
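/* Editor's illustrative sketch (not part of the original sources): a
   self-contained model of the duplicate-detection pass in
   aarch64_expand_vector_init above, using plain ints in place of the rtx
   lane values.  For the lanes {7, 3, 7, 7} it returns lane 0 with a count
   of 3, so lane 0's value is the one worth broadcasting with DUP before
   the remaining lanes are inserted individually.  */
#if 0
static int
model_most_common_lane (const int *elts, int n_elts, int *count_out)
{
  int matches[16][2] = { { 0, 0 } };
  for (int i = 0; i < n_elts; i++)
    for (int j = 0; j <= i; j++)
      if (elts[i] == elts[j])
        {
          matches[i][0] = j;    /* Earliest lane equal to lane I.  */
          matches[j][1]++;      /* Duplicate count charged to lane J.  */
          break;
        }
  int maxelement = 0, maxv = 0;
  for (int i = 0; i < n_elts; i++)
    if (matches[i][1] > maxv)
      {
        maxelement = i;
        maxv = matches[i][1];
      }
  *count_out = maxv;
  return maxelement;
}
#endif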
15983
15984 /* Emit RTL corresponding to:
15985 insr TARGET, ELEM. */
15986
15987 static void
15988 emit_insr (rtx target, rtx elem)
15989 {
15990 machine_mode mode = GET_MODE (target);
15991 scalar_mode elem_mode = GET_MODE_INNER (mode);
15992 elem = force_reg (elem_mode, elem);
15993
15994 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15995 gcc_assert (icode != CODE_FOR_nothing);
15996 emit_insn (GEN_FCN (icode) (target, target, elem));
15997 }
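/* Editor's illustrative sketch (not part of the original sources): SVE
   INSR shifts every lane of the destination up by one element and writes
   the scalar into lane 0, which is why the routines below can build a
   vector back to front with repeated emit_insr calls.  A plain-C model
   over int lanes:  */
#if 0
static void
model_insr (int *lanes, int n_lanes, int elem)
{
  for (int i = n_lanes - 1; i > 0; i--)
    lanes[i] = lanes[i - 1];    /* Shift the existing lanes up.  */
  lanes[0] = elem;              /* The new element lands in lane 0.  */
}
#endif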
15998
15999 /* Subroutine of aarch64_sve_expand_vector_init for handling
16000 trailing constants.
16001 This function works as follows:
16002 (a) Create a new vector consisting of trailing constants.
16003 (b) Initialize TARGET with the constant vector using emit_move_insn.
16004 (c) Insert remaining elements in TARGET using insr.
16005 NELTS is the total number of elements in the original vector, while
16006 NELTS_REQD is the number of elements that are actually
16007 significant.
16008
16009 ??? The heuristic used is to do the above only if the number of constants
16010 is at least half the total number of elements. May need fine-tuning. */
16011
16012 static bool
16013 aarch64_sve_expand_vector_init_handle_trailing_constants
16014 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16015 {
16016 machine_mode mode = GET_MODE (target);
16017 scalar_mode elem_mode = GET_MODE_INNER (mode);
16018 int n_trailing_constants = 0;
16019
16020 for (int i = nelts_reqd - 1;
16021 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16022 i--)
16023 n_trailing_constants++;
16024
16025 if (n_trailing_constants >= nelts_reqd / 2)
16026 {
16027 rtx_vector_builder v (mode, 1, nelts);
16028 for (int i = 0; i < nelts; i++)
16029 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16030 rtx const_vec = v.build ();
16031 emit_move_insn (target, const_vec);
16032
16033 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16034 emit_insr (target, builder.elt (i));
16035
16036 return true;
16037 }
16038
16039 return false;
16040 }
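/* Editor's worked example (illustrative, not part of the original sources):
   with BUILDER = {x, y, 1, 2, 3, 4, 5, 6} and NELTS = NELTS_REQD = 8, the
   loop above finds N_TRAILING_CONSTANTS = 6, which satisfies 6 >= 8/2.
   TARGET is first loaded with the shifted constant vector
   {1, 2, 3, 4, 5, 6, ...} (the last two lanes are don't-cares) and then
   "insr y" followed by "insr x" shift it into the required
   {x, y, 1, 2, 3, 4, 5, 6}.  */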
16041
16042 /* Subroutine of aarch64_sve_expand_vector_init.
16043 Works as follows:
16044 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16045 (b) Skip trailing elements from BUILDER, which are the same as
16046 element NELTS_REQD - 1.
16047 (c) Insert earlier elements in reverse order in TARGET using insr. */
16048
16049 static void
16050 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16051 const rtx_vector_builder &builder,
16052 int nelts_reqd)
16053 {
16054 machine_mode mode = GET_MODE (target);
16055 scalar_mode elem_mode = GET_MODE_INNER (mode);
16056
16057 struct expand_operand ops[2];
16058 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16059 gcc_assert (icode != CODE_FOR_nothing);
16060
16061 create_output_operand (&ops[0], target, mode);
16062 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16063 expand_insn (icode, 2, ops);
16064
16065 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16066 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16067 emit_insr (target, builder.elt (i));
16068 }
16069
16070 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16071 when all trailing elements of builder are same.
16072 This works as follows:
16073 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16074 (b) Insert remaining elements in TARGET using insr.
16075
16076 ??? The heuristic used is to do the above if the number of identical
16077 trailing elements is at least 3/4 of the total number of elements,
16078 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16079
16080 static bool
16081 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16082 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16083 {
16084 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16085 if (ndups >= (3 * nelts_reqd) / 4)
16086 {
16087 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16088 nelts_reqd - ndups + 1);
16089 return true;
16090 }
16091
16092 return false;
16093 }
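/* Editor's worked example (illustrative, not part of the original sources):
   for BUILDER = {a, b, c, c, c, c, c, c} with NELTS_REQD = 8, count_dups
   finds 6 trailing copies of c, and 6 >= (3 * 8) / 4, so the helper above
   calls aarch64_sve_expand_vector_init_insert_elems with
   NELTS_REQD - NDUPS + 1 = 3.  That emits a "dup" of c followed by
   "insr b" and "insr a", giving {a, b, c, c, c, c, c, c}.  */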
16094
16095 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16096 of elements in BUILDER.
16097
16098 The function tries to initialize TARGET from BUILDER if it fits one
16099 of the special cases outlined below.
16100
16101 Failing that, the function divides BUILDER into two sub-vectors:
16102 v_even = even elements of BUILDER;
16103 v_odd = odd elements of BUILDER;
16104
16105 and recursively calls itself with v_even and v_odd.
16106
16107 if (recursive call succeeded for v_even or v_odd)
16108 TARGET = zip (v_even, v_odd)
16109
16110 The function returns true if it managed to build TARGET from BUILDER
16111 with one of the special cases, false otherwise.
16112
16113 Example: {a, 1, b, 2, c, 3, d, 4}
16114
16115 The vector gets divided into:
16116 v_even = {a, b, c, d}
16117 v_odd = {1, 2, 3, 4}
16118
16119 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16120 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16121
16122 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16123 4 variable elements, so we construct tmp1 from v_even using insr:
16124 tmp1 = dup(d)
16125 insr tmp1, c
16126 insr tmp1, b
16127 insr tmp1, a
16128
16129 And finally:
16130 TARGET = zip (tmp1, tmp2)
16131 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16132
16133 static bool
16134 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16135 int nelts, int nelts_reqd)
16136 {
16137 machine_mode mode = GET_MODE (target);
16138
16139 /* Case 1: Vector contains trailing constants. */
16140
16141 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16142 (target, builder, nelts, nelts_reqd))
16143 return true;
16144
16145 /* Case 2: Vector contains leading constants. */
16146
16147 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16148 for (int i = 0; i < nelts_reqd; i++)
16149 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16150 rev_builder.finalize ();
16151
16152 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16153 (target, rev_builder, nelts, nelts_reqd))
16154 {
16155 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16156 return true;
16157 }
16158
16159 /* Case 3: Vector contains trailing same element. */
16160
16161 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16162 (target, builder, nelts_reqd))
16163 return true;
16164
16165 /* Case 4: Vector contains leading same element. */
16166
16167 if (nelts_reqd == nelts
16168 && aarch64_sve_expand_vector_init_handle_trailing_same_elem (target, rev_builder, nelts_reqd))
16169 {
16170 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16171 return true;
16172 }
16173
16174 /* Avoid recursing below 4-elements.
16175 ??? The threshold 4 may need fine-tuning. */
16176
16177 if (nelts_reqd <= 4)
16178 return false;
16179
16180 rtx_vector_builder v_even (mode, 1, nelts);
16181 rtx_vector_builder v_odd (mode, 1, nelts);
16182
16183 for (int i = 0; i < nelts * 2; i += 2)
16184 {
16185 v_even.quick_push (builder.elt (i));
16186 v_odd.quick_push (builder.elt (i + 1));
16187 }
16188
16189 v_even.finalize ();
16190 v_odd.finalize ();
16191
16192 rtx tmp1 = gen_reg_rtx (mode);
16193 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16194 nelts, nelts_reqd / 2);
16195
16196 rtx tmp2 = gen_reg_rtx (mode);
16197 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16198 nelts, nelts_reqd / 2);
16199
16200 if (!did_even_p && !did_odd_p)
16201 return false;
16202
16203 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16204 special cases and zip v_even, v_odd. */
16205
16206 if (!did_even_p)
16207 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16208
16209 if (!did_odd_p)
16210 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16211
16212 rtvec v = gen_rtvec (2, tmp1, tmp2);
16213 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16214 return true;
16215 }
16216
16217 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16218
16219 void
16220 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16221 {
16222 machine_mode mode = GET_MODE (target);
16223 int nelts = XVECLEN (vals, 0);
16224
16225 rtx_vector_builder v (mode, 1, nelts);
16226 for (int i = 0; i < nelts; i++)
16227 v.quick_push (XVECEXP (vals, 0, i));
16228 v.finalize ();
16229
16230 /* If neither sub-vector of v could be initialized specially,
16231 then use INSR to insert all elements from v into TARGET.
16232 ??? This might not be optimal for vectors with large
16233 initializers like 16-element or above.
16234 For nelts < 4, it probably isn't useful to handle specially. */
16235
16236 if (nelts < 4
16237 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16238 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16239 }
16240
16241 static unsigned HOST_WIDE_INT
16242 aarch64_shift_truncation_mask (machine_mode mode)
16243 {
16244 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16245 return 0;
16246 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16247 }
16248
16249 /* Select a format to encode pointers in exception handling data. */
16250 int
16251 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16252 {
16253 int type;
16254 switch (aarch64_cmodel)
16255 {
16256 case AARCH64_CMODEL_TINY:
16257 case AARCH64_CMODEL_TINY_PIC:
16258 case AARCH64_CMODEL_SMALL:
16259 case AARCH64_CMODEL_SMALL_PIC:
16260 case AARCH64_CMODEL_SMALL_SPIC:
16261 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16262 for everything. */
16263 type = DW_EH_PE_sdata4;
16264 break;
16265 default:
16266 /* No assumptions here. 8-byte relocs required. */
16267 type = DW_EH_PE_sdata8;
16268 break;
16269 }
16270 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16271 }
16272
16273 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16274
16275 static void
16276 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16277 {
16278 if (aarch64_simd_decl_p (decl))
16279 {
16280 fprintf (stream, "\t.variant_pcs\t");
16281 assemble_name (stream, name);
16282 fprintf (stream, "\n");
16283 }
16284 }
16285
16286 /* The last .arch and .tune assembly strings that we printed. */
16287 static std::string aarch64_last_printed_arch_string;
16288 static std::string aarch64_last_printed_tune_string;
16289
16290 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16291 by the function fndecl. */
16292
16293 void
16294 aarch64_declare_function_name (FILE *stream, const char* name,
16295 tree fndecl)
16296 {
16297 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16298
16299 struct cl_target_option *targ_options;
16300 if (target_parts)
16301 targ_options = TREE_TARGET_OPTION (target_parts);
16302 else
16303 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16304 gcc_assert (targ_options);
16305
16306 const struct processor *this_arch
16307 = aarch64_get_arch (targ_options->x_explicit_arch);
16308
16309 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16310 std::string extension
16311 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16312 this_arch->flags);
16313 /* Only update the assembler .arch string if it is distinct from the last
16314 such string we printed. */
16315 std::string to_print = this_arch->name + extension;
16316 if (to_print != aarch64_last_printed_arch_string)
16317 {
16318 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16319 aarch64_last_printed_arch_string = to_print;
16320 }
16321
16322 /* Print the cpu name we're tuning for in the comments; this might be
16323 useful to readers of the generated asm. Do it only when it changes
16324 from function to function and verbose assembly is requested. */
16325 const struct processor *this_tune
16326 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16327
16328 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16329 {
16330 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16331 this_tune->name);
16332 aarch64_last_printed_tune_string = this_tune->name;
16333 }
16334
16335 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16336
16337 /* Don't forget the type directive for ELF. */
16338 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16339 ASM_OUTPUT_LABEL (stream, name);
16340 }
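/* Editor's example (illustrative, not part of the original sources): in a
   translation unit compiled for plain -march=armv8-a, a definition like
   the sketch below makes aarch64_declare_function_name print a directive
   along the lines of ".arch armv8-a+sve" before the function label; with
   -dA a "// .tune <cpu>" comment may also appear when the tuning target
   changes from function to function.  */
#if 0
__attribute__ ((target ("+sve")))
void uses_sve (void)
{
}
#endif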
16341
16342 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16343
16344 void
16345 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16346 {
16347 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16348 const char *value = IDENTIFIER_POINTER (target);
16349 aarch64_asm_output_variant_pcs (stream, decl, name);
16350 ASM_OUTPUT_DEF (stream, name, value);
16351 }
16352
16353 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16354 function symbol references. */
16355
16356 void
16357 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16358 {
16359 default_elf_asm_output_external (stream, decl, name);
16360 aarch64_asm_output_variant_pcs (stream, decl, name);
16361 }
16362
16363 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16364 Used to output the .cfi_b_key_frame directive when signing the current
16365 function with the B key. */
16366
16367 void
16368 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16369 {
16370 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16371 && aarch64_ra_sign_key == AARCH64_KEY_B)
16372 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16373 }
16374
16375 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16376
16377 static void
16378 aarch64_start_file (void)
16379 {
16380 struct cl_target_option *default_options
16381 = TREE_TARGET_OPTION (target_option_default_node);
16382
16383 const struct processor *default_arch
16384 = aarch64_get_arch (default_options->x_explicit_arch);
16385 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16386 std::string extension
16387 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16388 default_arch->flags);
16389
16390 aarch64_last_printed_arch_string = default_arch->name + extension;
16391 aarch64_last_printed_tune_string = "";
16392 asm_fprintf (asm_out_file, "\t.arch %s\n",
16393 aarch64_last_printed_arch_string.c_str ());
16394
16395 default_file_start ();
16396 }
16397
16398 /* Emit load exclusive. */
16399
16400 static void
16401 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16402 rtx mem, rtx model_rtx)
16403 {
16404 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16405 }
16406
16407 /* Emit store exclusive. */
16408
16409 static void
16410 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16411 rtx mem, rtx rval, rtx model_rtx)
16412 {
16413 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16414 }
16415
16416 /* Mark the previous jump instruction as unlikely. */
16417
16418 static void
16419 aarch64_emit_unlikely_jump (rtx insn)
16420 {
16421 rtx_insn *jump = emit_jump_insn (insn);
16422 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16423 }
16424
16425 /* Expand a compare and swap pattern. */
16426
16427 void
16428 aarch64_expand_compare_and_swap (rtx operands[])
16429 {
16430 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16431 machine_mode mode, r_mode;
16432
16433 bval = operands[0];
16434 rval = operands[1];
16435 mem = operands[2];
16436 oldval = operands[3];
16437 newval = operands[4];
16438 is_weak = operands[5];
16439 mod_s = operands[6];
16440 mod_f = operands[7];
16441 mode = GET_MODE (mem);
16442
16443 /* Normally the succ memory model must be stronger than fail, but in the
16444 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16445 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16446 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16447 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16448 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16449
16450 r_mode = mode;
16451 if (mode == QImode || mode == HImode)
16452 {
16453 r_mode = SImode;
16454 rval = gen_reg_rtx (r_mode);
16455 }
16456
16457 if (TARGET_LSE)
16458 {
16459 /* The CAS insn requires oldval and rval overlap, but we need to
16460 have a copy of oldval saved across the operation to tell if
16461 the operation is successful. */
16462 if (reg_overlap_mentioned_p (rval, oldval))
16463 rval = copy_to_mode_reg (r_mode, oldval);
16464 else
16465 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16466
16467 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16468 newval, mod_s));
16469 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16470 }
16471 else
16472 {
16473 /* The oldval predicate varies by mode. Test it and force to reg. */
16474 insn_code code = code_for_aarch64_compare_and_swap (mode);
16475 if (!insn_data[code].operand[2].predicate (oldval, mode))
16476 oldval = force_reg (mode, oldval);
16477
16478 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16479 is_weak, mod_s, mod_f));
16480 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16481 }
16482
16483 if (r_mode != mode)
16484 rval = gen_lowpart (mode, rval);
16485 emit_move_insn (operands[1], rval);
16486
16487 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16488 emit_insn (gen_rtx_SET (bval, x));
16489 }
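/* Editor's usage sketch (illustrative, not part of the original sources):
   this expander is what a GNU/C11 compare-exchange ultimately reaches.
   In outline, with -march=armv8.1-a (TARGET_LSE) the code below takes the
   single-instruction CAS path above, while base armv8-a goes through the
   load/store-exclusive loop split out by aarch64_split_compare_and_swap
   further down.  */
#if 0
#include <stdatomic.h>

static _Bool
model_cas (_Atomic int *p, int expected, int desired)
{
  return atomic_compare_exchange_strong (p, &expected, desired);
}
#endif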
16490
16491 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16492 sequence implementing an atomic operation. */
16493
16494 static void
16495 aarch64_emit_post_barrier (enum memmodel model)
16496 {
16497 const enum memmodel base_model = memmodel_base (model);
16498
16499 if (is_mm_sync (model)
16500 && (base_model == MEMMODEL_ACQUIRE
16501 || base_model == MEMMODEL_ACQ_REL
16502 || base_model == MEMMODEL_SEQ_CST))
16503 {
16504 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16505 }
16506 }
16507
16508 /* Split a compare and swap pattern. */
16509
16510 void
16511 aarch64_split_compare_and_swap (rtx operands[])
16512 {
16513 rtx rval, mem, oldval, newval, scratch;
16514 machine_mode mode;
16515 bool is_weak;
16516 rtx_code_label *label1, *label2;
16517 rtx x, cond;
16518 enum memmodel model;
16519 rtx model_rtx;
16520
16521 rval = operands[0];
16522 mem = operands[1];
16523 oldval = operands[2];
16524 newval = operands[3];
16525 is_weak = (operands[4] != const0_rtx);
16526 model_rtx = operands[5];
16527 scratch = operands[7];
16528 mode = GET_MODE (mem);
16529 model = memmodel_from_int (INTVAL (model_rtx));
16530
16531 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16532 loop:
16533 .label1:
16534 LD[A]XR rval, [mem]
16535 CBNZ rval, .label2
16536 ST[L]XR scratch, newval, [mem]
16537 CBNZ scratch, .label1
16538 .label2:
16539 CMP rval, 0. */
16540 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16541
16542 label1 = NULL;
16543 if (!is_weak)
16544 {
16545 label1 = gen_label_rtx ();
16546 emit_label (label1);
16547 }
16548 label2 = gen_label_rtx ();
16549
16550 /* The initial load can be relaxed for a __sync operation since a final
16551 barrier will be emitted to stop code hoisting. */
16552 if (is_mm_sync (model))
16553 aarch64_emit_load_exclusive (mode, rval, mem,
16554 GEN_INT (MEMMODEL_RELAXED));
16555 else
16556 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16557
16558 if (strong_zero_p)
16559 {
16560 if (aarch64_track_speculation)
16561 {
16562 /* Emit an explicit compare instruction, so that we can correctly
16563 track the condition codes. */
16564 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16565 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16566 }
16567 else
16568 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16569
16570 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16571 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16572 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16573 }
16574 else
16575 {
16576 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16577 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16578 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16579 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16580 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16581 }
16582
16583 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16584
16585 if (!is_weak)
16586 {
16587 if (aarch64_track_speculation)
16588 {
16589 /* Emit an explicit compare instruction, so that we can correctly
16590 track the condition codes. */
16591 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16592 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16593 }
16594 else
16595 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16596
16597 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16598 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16599 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16600 }
16601 else
16602 {
16603 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16604 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16605 emit_insn (gen_rtx_SET (cond, x));
16606 }
16607
16608 emit_label (label2);
16609 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16610 to set the condition flags. If this is not used it will be removed by
16611 later passes. */
16612 if (strong_zero_p)
16613 {
16614 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16615 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16616 emit_insn (gen_rtx_SET (cond, x));
16617 }
16618 /* Emit any final barrier needed for a __sync operation. */
16619 if (is_mm_sync (model))
16620 aarch64_emit_post_barrier (model);
16621 }
16622
16623 /* Split an atomic operation. */
16624
16625 void
16626 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16627 rtx value, rtx model_rtx, rtx cond)
16628 {
16629 machine_mode mode = GET_MODE (mem);
16630 machine_mode wmode = (mode == DImode ? DImode : SImode);
16631 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16632 const bool is_sync = is_mm_sync (model);
16633 rtx_code_label *label;
16634 rtx x;
16635
16636 /* Split the atomic operation into a sequence. */
16637 label = gen_label_rtx ();
16638 emit_label (label);
16639
16640 if (new_out)
16641 new_out = gen_lowpart (wmode, new_out);
16642 if (old_out)
16643 old_out = gen_lowpart (wmode, old_out);
16644 else
16645 old_out = new_out;
16646 value = simplify_gen_subreg (wmode, value, mode, 0);
16647
16648 /* The initial load can be relaxed for a __sync operation since a final
16649 barrier will be emitted to stop code hoisting. */
16650 if (is_sync)
16651 aarch64_emit_load_exclusive (mode, old_out, mem,
16652 GEN_INT (MEMMODEL_RELAXED));
16653 else
16654 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16655
16656 switch (code)
16657 {
16658 case SET:
16659 new_out = value;
16660 break;
16661
16662 case NOT:
16663 x = gen_rtx_AND (wmode, old_out, value);
16664 emit_insn (gen_rtx_SET (new_out, x));
16665 x = gen_rtx_NOT (wmode, new_out);
16666 emit_insn (gen_rtx_SET (new_out, x));
16667 break;
16668
16669 case MINUS:
16670 if (CONST_INT_P (value))
16671 {
16672 value = GEN_INT (-INTVAL (value));
16673 code = PLUS;
16674 }
16675 /* Fall through. */
16676
16677 default:
16678 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16679 emit_insn (gen_rtx_SET (new_out, x));
16680 break;
16681 }
16682
16683 aarch64_emit_store_exclusive (mode, cond, mem,
16684 gen_lowpart (mode, new_out), model_rtx);
16685
16686 if (aarch64_track_speculation)
16687 {
16688 /* Emit an explicit compare instruction, so that we can correctly
16689 track the condition codes. */
16690 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16691 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16692 }
16693 else
16694 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16695
16696 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16697 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16698 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16699
16700 /* Emit any final barrier needed for a __sync operation. */
16701 if (is_sync)
16702 aarch64_emit_post_barrier (model);
16703 }
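/* Editor's usage sketch (illustrative, not part of the original sources):
   without LSE atomics, a read-modify-write such as the one below is
   eventually split by aarch64_split_atomic_op above into a
   load-exclusive / operate / store-exclusive loop, with the retry branch
   emitted through aarch64_emit_unlikely_jump.  */
#if 0
#include <stdatomic.h>

static int
model_fetch_add (_Atomic int *p, int v)
{
  return atomic_fetch_add (p, v);
}
#endif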
16704
16705 static void
16706 aarch64_init_libfuncs (void)
16707 {
16708 /* Half-precision float operations. The compiler handles all operations
16709 with NULL libfuncs by converting to SFmode. */
16710
16711 /* Conversions. */
16712 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16713 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16714
16715 /* Arithmetic. */
16716 set_optab_libfunc (add_optab, HFmode, NULL);
16717 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16718 set_optab_libfunc (smul_optab, HFmode, NULL);
16719 set_optab_libfunc (neg_optab, HFmode, NULL);
16720 set_optab_libfunc (sub_optab, HFmode, NULL);
16721
16722 /* Comparisons. */
16723 set_optab_libfunc (eq_optab, HFmode, NULL);
16724 set_optab_libfunc (ne_optab, HFmode, NULL);
16725 set_optab_libfunc (lt_optab, HFmode, NULL);
16726 set_optab_libfunc (le_optab, HFmode, NULL);
16727 set_optab_libfunc (ge_optab, HFmode, NULL);
16728 set_optab_libfunc (gt_optab, HFmode, NULL);
16729 set_optab_libfunc (unord_optab, HFmode, NULL);
16730 }
16731
16732 /* Target hook for c_mode_for_suffix. */
16733 static machine_mode
16734 aarch64_c_mode_for_suffix (char suffix)
16735 {
16736 if (suffix == 'q')
16737 return TFmode;
16738
16739 return VOIDmode;
16740 }
16741
16742 /* We can only represent floating point constants which will fit in
16743 "quarter-precision" values. These values are characterised by
16744 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16745 by:
16746
16747 (-1)^s * (n/16) * 2^r
16748
16749 Where:
16750 's' is the sign bit.
16751 'n' is an integer in the range 16 <= n <= 31.
16752 'r' is an integer in the range -3 <= r <= 4. */
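/* Editor's sketch (illustrative, not part of the original sources): a
   brute-force restatement of the formula above, enumerating all 256
   representable magnitudes.  Exact equality is safe here because every
   candidate value is exactly representable in double.  For example
   0.5 = (16/16) * 2^-1 is representable, while 0.1 is not.  */
#if 0
#include <math.h>
#include <stdbool.h>

static bool
model_quarter_precision_p (double x)
{
  if (x == 0.0 || !isfinite (x))
    return false;
  double mag = fabs (x);
  for (int n = 16; n <= 31; n++)
    for (int r = -3; r <= 4; r++)
      if (mag == (n / 16.0) * ldexp (1.0, r))
        return true;
  return false;
}
#endif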
16753
16754 /* Return true iff X can be represented as a quarter-precision
16755 floating point immediate. Note, we cannot represent 0.0. */
16756 bool
16757 aarch64_float_const_representable_p (rtx x)
16758 {
16759 /* This represents our current view of how many bits
16760 make up the mantissa. */
16761 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16762 int exponent;
16763 unsigned HOST_WIDE_INT mantissa, mask;
16764 REAL_VALUE_TYPE r, m;
16765 bool fail;
16766
16767 if (!CONST_DOUBLE_P (x))
16768 return false;
16769
16770 if (GET_MODE (x) == VOIDmode
16771 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16772 return false;
16773
16774 r = *CONST_DOUBLE_REAL_VALUE (x);
16775
16776 /* We cannot represent infinities, NaNs or +/-zero. We won't
16777 know if we have +zero until we analyse the mantissa, but we
16778 can reject the other invalid values. */
16779 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16780 || REAL_VALUE_MINUS_ZERO (r))
16781 return false;
16782
16783 /* Extract exponent. */
16784 r = real_value_abs (&r);
16785 exponent = REAL_EXP (&r);
16786
16787 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16788 highest (sign) bit, with a fixed binary point at bit point_pos.
16789 The wide_int W computed below holds the low part of the mantissa in w.ulow () and the high part in w.elt (1).
16790 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16791 bits for the mantissa, this can fail (low bits will be lost). */
16792 real_ldexp (&m, &r, point_pos - exponent);
16793 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16794
16795 /* If the low part of the mantissa has bits set we cannot represent
16796 the value. */
16797 if (w.ulow () != 0)
16798 return false;
16799 /* We have rejected the lower HOST_WIDE_INT, so update our
16800 understanding of how many bits lie in the mantissa and
16801 look only at the high HOST_WIDE_INT. */
16802 mantissa = w.elt (1);
16803 point_pos -= HOST_BITS_PER_WIDE_INT;
16804
16805 /* We can only represent values with a mantissa of the form 1.xxxx. */
16806 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16807 if ((mantissa & mask) != 0)
16808 return false;
16809
16810 /* Having filtered unrepresentable values, we may now remove all
16811 but the highest 5 bits. */
16812 mantissa >>= point_pos - 5;
16813
16814 /* We cannot represent the value 0.0, so reject it. This is handled
16815 elsewhere. */
16816 if (mantissa == 0)
16817 return false;
16818
16819 /* Then, as bit 4 is always set, we can mask it off, leaving
16820 the mantissa in the range [0, 15]. */
16821 mantissa &= ~(1 << 4);
16822 gcc_assert (mantissa <= 15);
16823
16824 /* GCC internally does not use IEEE754-like encoding (where normalized
16825 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
16826 Our mantissa values are shifted 4 places to the left relative to
16827 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16828 by 5 places to correct for GCC's representation. */
16829 exponent = 5 - exponent;
16830
16831 return (exponent >= 0 && exponent <= 7);
16832 }
16833
16834 /* Return the instruction string for the AdvSIMD MOVI, MVNI, ORR or BIC
16835 immediate that implements CONST_VECTOR, a vector immediate of WIDTH bits.
16836 WHICH selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
16837 char*
16838 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16839 enum simd_immediate_check which)
16840 {
16841 bool is_valid;
16842 static char templ[40];
16843 const char *mnemonic;
16844 const char *shift_op;
16845 unsigned int lane_count = 0;
16846 char element_char;
16847
16848 struct simd_immediate_info info;
16849
16850 /* This will return true to show const_vector is legal for use as either
16851 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16852 It will also update INFO to show how the immediate should be generated.
16853 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16854 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16855 gcc_assert (is_valid);
16856
16857 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16858 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16859
16860 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16861 {
16862 gcc_assert (info.insn == simd_immediate_info::MOV
16863 && info.u.mov.shift == 0);
16864 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16865 move immediate path. */
16866 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16867 info.u.mov.value = GEN_INT (0);
16868 else
16869 {
16870 const unsigned int buf_size = 20;
16871 char float_buf[buf_size] = {'\0'};
16872 real_to_decimal_for_mode (float_buf,
16873 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16874 buf_size, buf_size, 1, info.elt_mode);
16875
16876 if (lane_count == 1)
16877 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16878 else
16879 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16880 lane_count, element_char, float_buf);
16881 return templ;
16882 }
16883 }
16884
16885 gcc_assert (CONST_INT_P (info.u.mov.value));
16886
16887 if (which == AARCH64_CHECK_MOV)
16888 {
16889 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16890 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
16891 ? "msl" : "lsl");
16892 if (lane_count == 1)
16893 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16894 mnemonic, UINTVAL (info.u.mov.value));
16895 else if (info.u.mov.shift)
16896 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16897 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16898 element_char, UINTVAL (info.u.mov.value), shift_op,
16899 info.u.mov.shift);
16900 else
16901 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16902 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16903 element_char, UINTVAL (info.u.mov.value));
16904 }
16905 else
16906 {
16907 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16908 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16909 if (info.u.mov.shift)
16910 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16911 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16912 element_char, UINTVAL (info.u.mov.value), "lsl",
16913 info.u.mov.shift);
16914 else
16915 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16916 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16917 element_char, UINTVAL (info.u.mov.value));
16918 }
16919 return templ;
16920 }
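/* Editor's example (illustrative, not part of the original sources): a
   constant broadcast like the one below is typically printed through this
   routine, producing a single move-immediate such as
   "movi v0.4s, 0x1, lsl 8" for the value 0x100.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
splat_0x100 (void)
{
  return (v4si) { 0x100, 0x100, 0x100, 0x100 };
}
#endif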
16921
16922 char*
16923 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16924 {
16925
16926 /* If a floating-point number was passed and we want to use it in an
16927 integer mode, convert it to an integer first. */
16928 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16929 {
16930 unsigned HOST_WIDE_INT ival;
16931 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16932 gcc_unreachable ();
16933 immediate = gen_int_mode (ival, mode);
16934 }
16935
16936 machine_mode vmode;
16937 /* Use a 64-bit container mode for everything except DI/DF mode, where we
16938 use a 128-bit vector mode. */
16939 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16940
16941 vmode = aarch64_simd_container_mode (mode, width);
16942 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16943 return aarch64_output_simd_mov_immediate (v_op, width);
16944 }
16945
16946 /* Return the output string to use for moving immediate CONST_VECTOR
16947 into an SVE register. */
16948
16949 char *
16950 aarch64_output_sve_mov_immediate (rtx const_vector)
16951 {
16952 static char templ[40];
16953 struct simd_immediate_info info;
16954 char element_char;
16955
16956 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16957 gcc_assert (is_valid);
16958
16959 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16960
16961 machine_mode vec_mode = GET_MODE (const_vector);
16962 if (aarch64_sve_pred_mode_p (vec_mode))
16963 {
16964 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16965 if (info.insn == simd_immediate_info::MOV)
16966 {
16967 gcc_assert (info.u.mov.value == const0_rtx);
16968 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
16969 }
16970 else
16971 {
16972 gcc_assert (info.insn == simd_immediate_info::PTRUE);
16973 unsigned int total_bytes;
16974 if (info.u.pattern == AARCH64_SV_ALL
16975 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
16976 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
16977 total_bytes / GET_MODE_SIZE (info.elt_mode));
16978 else
16979 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
16980 svpattern_token (info.u.pattern));
16981 }
16982 return buf;
16983 }
16984
16985 if (info.insn == simd_immediate_info::INDEX)
16986 {
16987 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16988 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16989 element_char, INTVAL (info.u.index.base),
16990 INTVAL (info.u.index.step));
16991 return templ;
16992 }
16993
16994 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16995 {
16996 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16997 info.u.mov.value = GEN_INT (0);
16998 else
16999 {
17000 const int buf_size = 20;
17001 char float_buf[buf_size] = {};
17002 real_to_decimal_for_mode (float_buf,
17003 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17004 buf_size, buf_size, 1, info.elt_mode);
17005
17006 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17007 element_char, float_buf);
17008 return templ;
17009 }
17010 }
17011
17012 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17013 element_char, INTVAL (info.u.mov.value));
17014 return templ;
17015 }
17016
17017 /* Split a move of op[1] and op[2] into the two halves of op[0]. */
17018
17019 void
17020 aarch64_split_combinev16qi (rtx operands[3])
17021 {
17022 unsigned int dest = REGNO (operands[0]);
17023 unsigned int src1 = REGNO (operands[1]);
17024 unsigned int src2 = REGNO (operands[2]);
17025 machine_mode halfmode = GET_MODE (operands[1]);
17026 unsigned int halfregs = REG_NREGS (operands[1]);
17027 rtx destlo, desthi;
17028
17029 gcc_assert (halfmode == V16QImode);
17030
17031 if (src1 == dest && src2 == dest + halfregs)
17032 {
17033 /* No-op move. Can't split to nothing; emit something. */
17034 emit_note (NOTE_INSN_DELETED);
17035 return;
17036 }
17037
17038 /* Preserve register attributes for variable tracking. */
17039 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17040 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17041 GET_MODE_SIZE (halfmode));
17042
17043 /* Special case of reversed high/low parts. */
17044 if (reg_overlap_mentioned_p (operands[2], destlo)
17045 && reg_overlap_mentioned_p (operands[1], desthi))
17046 {
17047 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17048 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17049 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17050 }
17051 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17052 {
17053 /* Try to avoid unnecessary moves if part of the result
17054 is in the right place already. */
17055 if (src1 != dest)
17056 emit_move_insn (destlo, operands[1]);
17057 if (src2 != dest + halfregs)
17058 emit_move_insn (desthi, operands[2]);
17059 }
17060 else
17061 {
17062 if (src2 != dest + halfregs)
17063 emit_move_insn (desthi, operands[2]);
17064 if (src1 != dest)
17065 emit_move_insn (destlo, operands[1]);
17066 }
17067 }
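/* Editor's sketch (illustrative, not part of the original sources): the
   three XORs above are the classic in-place swap, used because both
   halves of the destination overlap both sources and no scratch register
   is available.  Scalar model:  */
#if 0
static void
model_xor_swap (unsigned int *a, unsigned int *b)
{
  *a ^= *b;     /* a = a ^ b             */
  *b ^= *a;     /* b = b ^ (a ^ b) == a  */
  *a ^= *b;     /* a = (a ^ b) ^ a == b  */
}
#endif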
17068
17069 /* vec_perm support. */
17070
17071 struct expand_vec_perm_d
17072 {
17073 rtx target, op0, op1;
17074 vec_perm_indices perm;
17075 machine_mode vmode;
17076 unsigned int vec_flags;
17077 bool one_vector_p;
17078 bool testing_p;
17079 };
17080
17081 /* Generate a variable permutation. */
17082
17083 static void
17084 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17085 {
17086 machine_mode vmode = GET_MODE (target);
17087 bool one_vector_p = rtx_equal_p (op0, op1);
17088
17089 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17090 gcc_checking_assert (GET_MODE (op0) == vmode);
17091 gcc_checking_assert (GET_MODE (op1) == vmode);
17092 gcc_checking_assert (GET_MODE (sel) == vmode);
17093 gcc_checking_assert (TARGET_SIMD);
17094
17095 if (one_vector_p)
17096 {
17097 if (vmode == V8QImode)
17098 {
17099 /* Expand the argument to a V16QI mode by duplicating it. */
17100 rtx pair = gen_reg_rtx (V16QImode);
17101 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17102 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17103 }
17104 else
17105 {
17106 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17107 }
17108 }
17109 else
17110 {
17111 rtx pair;
17112
17113 if (vmode == V8QImode)
17114 {
17115 pair = gen_reg_rtx (V16QImode);
17116 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17117 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17118 }
17119 else
17120 {
17121 pair = gen_reg_rtx (OImode);
17122 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17123 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17124 }
17125 }
17126 }
17127
17128 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17129 NELT is the number of elements in the vector. */
17130
17131 void
17132 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17133 unsigned int nelt)
17134 {
17135 machine_mode vmode = GET_MODE (target);
17136 bool one_vector_p = rtx_equal_p (op0, op1);
17137 rtx mask;
17138
17139 /* The TBL instruction does not use a modulo index, so we must take care
17140 of that ourselves. */
17141 mask = aarch64_simd_gen_const_vector_dup (vmode,
17142 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17143 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17144
17145 /* For big-endian, we also need to reverse the index within the vector
17146 (but not which vector). */
17147 if (BYTES_BIG_ENDIAN)
17148 {
17149 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17150 if (!one_vector_p)
17151 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17152 sel = expand_simple_binop (vmode, XOR, sel, mask,
17153 NULL, 0, OPTAB_LIB_WIDEN);
17154 }
17155 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17156 }
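/* Editor's sketch (illustrative, not part of the original sources): the
   AND/XOR massaging above, written out for a single index.  The AND
   applies the modulo that vec_perm requires but TBL does not provide; the
   XOR flips the lane within each input vector for big-endian.  */
#if 0
static unsigned int
model_tbl_index (unsigned int sel, unsigned int nelt,
                 int one_vector_p, int big_endian_p)
{
  unsigned int mask = one_vector_p ? nelt - 1 : 2 * nelt - 1;
  sel &= mask;
  if (big_endian_p)
    sel ^= nelt - 1;
  return sel;
}
#endif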
17157
17158 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17159
17160 static void
17161 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17162 {
17163 emit_insn (gen_rtx_SET (target,
17164 gen_rtx_UNSPEC (GET_MODE (target),
17165 gen_rtvec (2, op0, op1), code)));
17166 }
17167
17168 /* Expand an SVE vec_perm with the given operands. */
17169
17170 void
17171 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17172 {
17173 machine_mode data_mode = GET_MODE (target);
17174 machine_mode sel_mode = GET_MODE (sel);
17175 /* Enforced by the pattern condition. */
17176 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17177
17178 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17179 size of the two value vectors, i.e. the upper bits of the indices
17180 are effectively ignored. SVE TBL instead produces 0 for any
17181 out-of-range indices, so we need to modulo all the vec_perm indices
17182 to ensure they are all in range. */
17183 rtx sel_reg = force_reg (sel_mode, sel);
17184
17185 /* Check if the sel only references the first values vector. */
17186 if (GET_CODE (sel) == CONST_VECTOR
17187 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17188 {
17189 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17190 return;
17191 }
17192
17193 /* Check if the two values vectors are the same. */
17194 if (rtx_equal_p (op0, op1))
17195 {
17196 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17197 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17198 NULL, 0, OPTAB_DIRECT);
17199 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17200 return;
17201 }
17202
17203 /* Run TBL on each value vector and combine the results. */
17204
17205 rtx res0 = gen_reg_rtx (data_mode);
17206 rtx res1 = gen_reg_rtx (data_mode);
17207 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17208 if (GET_CODE (sel) != CONST_VECTOR
17209 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17210 {
17211 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17212 2 * nunits - 1);
17213 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17214 NULL, 0, OPTAB_DIRECT);
17215 }
17216 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17217 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17218 NULL, 0, OPTAB_DIRECT);
17219 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17220 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17221 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17222 else
17223 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17224 }
17225
17226 /* Recognize patterns suitable for the TRN instructions. */
17227 static bool
17228 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17229 {
17230 HOST_WIDE_INT odd;
17231 poly_uint64 nelt = d->perm.length ();
17232 rtx out, in0, in1, x;
17233 machine_mode vmode = d->vmode;
17234
17235 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17236 return false;
17237
17238 /* Note that these are little-endian tests.
17239 We correct for big-endian later. */
17240 if (!d->perm[0].is_constant (&odd)
17241 || (odd != 0 && odd != 1)
17242 || !d->perm.series_p (0, 2, odd, 2)
17243 || !d->perm.series_p (1, 2, nelt + odd, 2))
17244 return false;
17245
17246 /* Success! */
17247 if (d->testing_p)
17248 return true;
17249
17250 in0 = d->op0;
17251 in1 = d->op1;
17252 /* We don't need a big-endian lane correction for SVE; see the comment
17253 at the head of aarch64-sve.md for details. */
17254 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17255 {
17256 x = in0, in0 = in1, in1 = x;
17257 odd = !odd;
17258 }
17259 out = d->target;
17260
17261 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17262 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17263 return true;
17264 }
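/* Editor's example (illustrative, not part of the original sources): for
   a pair of 4-element vectors, TRN1 corresponds to the vec_perm selector
   {0, 4, 2, 6} and TRN2 to {1, 5, 3, 7}; those are exactly the series
   recognised above.  A generic-vector permute that maps onto TRN1:  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
trn1_example (v4si a, v4si b)
{
  return __builtin_shuffle (a, b, (v4si) { 0, 4, 2, 6 });
}
#endif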
17265
17266 /* Recognize patterns suitable for the UZP instructions. */
17267 static bool
17268 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17269 {
17270 HOST_WIDE_INT odd;
17271 rtx out, in0, in1, x;
17272 machine_mode vmode = d->vmode;
17273
17274 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17275 return false;
17276
17277 /* Note that these are little-endian tests.
17278 We correct for big-endian later. */
17279 if (!d->perm[0].is_constant (&odd)
17280 || (odd != 0 && odd != 1)
17281 || !d->perm.series_p (0, 1, odd, 2))
17282 return false;
17283
17284 /* Success! */
17285 if (d->testing_p)
17286 return true;
17287
17288 in0 = d->op0;
17289 in1 = d->op1;
17290 /* We don't need a big-endian lane correction for SVE; see the comment
17291 at the head of aarch64-sve.md for details. */
17292 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17293 {
17294 x = in0, in0 = in1, in1 = x;
17295 odd = !odd;
17296 }
17297 out = d->target;
17298
17299 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17300 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17301 return true;
17302 }
17303
17304 /* Recognize patterns suitable for the ZIP instructions. */
17305 static bool
17306 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17307 {
17308 unsigned int high;
17309 poly_uint64 nelt = d->perm.length ();
17310 rtx out, in0, in1, x;
17311 machine_mode vmode = d->vmode;
17312
17313 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17314 return false;
17315
17316 /* Note that these are little-endian tests.
17317 We correct for big-endian later. */
17318 poly_uint64 first = d->perm[0];
17319 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17320 || !d->perm.series_p (0, 2, first, 1)
17321 || !d->perm.series_p (1, 2, first + nelt, 1))
17322 return false;
17323 high = maybe_ne (first, 0U);
17324
17325 /* Success! */
17326 if (d->testing_p)
17327 return true;
17328
17329 in0 = d->op0;
17330 in1 = d->op1;
17331 /* We don't need a big-endian lane correction for SVE; see the comment
17332 at the head of aarch64-sve.md for details. */
17333 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17334 {
17335 x = in0, in0 = in1, in1 = x;
17336 high = !high;
17337 }
17338 out = d->target;
17339
17340 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17341 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17342 return true;
17343 }
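/* Editor's example (illustrative, not part of the original sources): for
   4-element vectors, ZIP1 interleaves the low halves (selector
   {0, 4, 1, 5}) and ZIP2 the high halves (selector {2, 6, 3, 7}).  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
zip1_example (v4si a, v4si b)
{
  return __builtin_shuffle (a, b, (v4si) { 0, 4, 1, 5 });
}
#endif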
17344
17345 /* Recognize patterns for the EXT insn. */
17346
17347 static bool
17348 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17349 {
17350 HOST_WIDE_INT location;
17351 rtx offset;
17352
17353 /* The first element always refers to the first vector.
17354 Check if the extracted indices are increasing by one. */
17355 if (d->vec_flags == VEC_SVE_PRED
17356 || !d->perm[0].is_constant (&location)
17357 || !d->perm.series_p (0, 1, location, 1))
17358 return false;
17359
17360 /* Success! */
17361 if (d->testing_p)
17362 return true;
17363
17364 /* The case where (location == 0) is a no-op for both big- and little-endian,
17365 and is removed by the mid-end at optimization levels -O1 and higher.
17366
17367 We don't need a big-endian lane correction for SVE; see the comment
17368 at the head of aarch64-sve.md for details. */
17369 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17370 {
17371 /* After setup, we want the high elements of the first vector (stored
17372 at the LSB end of the register), and the low elements of the second
17373 vector (stored at the MSB end of the register). So swap. */
17374 std::swap (d->op0, d->op1);
17375 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17376 to_constant () is safe since this is restricted to Advanced SIMD
17377 vectors. */
17378 location = d->perm.length ().to_constant () - location;
17379 }
17380
17381 offset = GEN_INT (location);
17382 emit_set_insn (d->target,
17383 gen_rtx_UNSPEC (d->vmode,
17384 gen_rtvec (3, d->op0, d->op1, offset),
17385 UNSPEC_EXT));
17386 return true;
17387 }
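/* Editor's example (illustrative, not part of the original sources): EXT
   extracts a sliding window from the concatenation of the two inputs, so
   the selector {1, 2, 3, 4} on 4-element vectors is a single EXT with
   location 1.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
ext_example (v4si a, v4si b)
{
  return __builtin_shuffle (a, b, (v4si) { 1, 2, 3, 4 });
}
#endif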
17388
17389 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17390 within each 64-bit, 32-bit or 16-bit granule. */
17391
17392 static bool
17393 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17394 {
17395 HOST_WIDE_INT diff;
17396 unsigned int i, size, unspec;
17397 machine_mode pred_mode;
17398
17399 if (d->vec_flags == VEC_SVE_PRED
17400 || !d->one_vector_p
17401 || !d->perm[0].is_constant (&diff))
17402 return false;
17403
17404 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17405 if (size == 8)
17406 {
17407 unspec = UNSPEC_REV64;
17408 pred_mode = VNx2BImode;
17409 }
17410 else if (size == 4)
17411 {
17412 unspec = UNSPEC_REV32;
17413 pred_mode = VNx4BImode;
17414 }
17415 else if (size == 2)
17416 {
17417 unspec = UNSPEC_REV16;
17418 pred_mode = VNx8BImode;
17419 }
17420 else
17421 return false;
17422
17423 unsigned int step = diff + 1;
17424 for (i = 0; i < step; ++i)
17425 if (!d->perm.series_p (i, step, diff - i, step))
17426 return false;
17427
17428 /* Success! */
17429 if (d->testing_p)
17430 return true;
17431
17432 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17433 if (d->vec_flags == VEC_SVE_DATA)
17434 {
17435 rtx pred = aarch64_ptrue_reg (pred_mode);
17436 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17437 UNSPEC_MERGE_PTRUE);
17438 }
17439 emit_set_insn (d->target, src);
17440 return true;
17441 }
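/* Editor's example (illustrative, not part of the original sources):
   swapping adjacent 16-bit lanes within each 32-bit granule, selector
   {1, 0, 3, 2, 5, 4, 7, 6} on an 8 x 16-bit vector, matches the REV32
   case above (diff == 1, so size == 4).  */
#if 0
typedef short v8hi __attribute__ ((vector_size (16)));

v8hi
rev32_example (v8hi a)
{
  return __builtin_shuffle (a, (v8hi) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
#endif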
17442
17443 /* Recognize patterns for the REV insn, which reverses elements within
17444 a full vector. */
17445
17446 static bool
17447 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17448 {
17449 poly_uint64 nelt = d->perm.length ();
17450
17451 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17452 return false;
17453
17454 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17455 return false;
17456
17457 /* Success! */
17458 if (d->testing_p)
17459 return true;
17460
17461 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17462 emit_set_insn (d->target, src);
17463 return true;
17464 }
17465
17466 static bool
17467 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17468 {
17469 rtx out = d->target;
17470 rtx in0;
17471 HOST_WIDE_INT elt;
17472 machine_mode vmode = d->vmode;
17473 rtx lane;
17474
17475 if (d->vec_flags == VEC_SVE_PRED
17476 || d->perm.encoding ().encoded_nelts () != 1
17477 || !d->perm[0].is_constant (&elt))
17478 return false;
17479
17480 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17481 return false;
17482
17483 /* Success! */
17484 if (d->testing_p)
17485 return true;
17486
17487 /* The generic preparation in aarch64_expand_vec_perm_const_1
17488 swaps the operand order and the permute indices if it finds
17489 d->perm[0] to be in the second operand. Thus, we can always
17490 use d->op0 and need not do any extra arithmetic to get the
17491 correct lane number. */
17492 in0 = d->op0;
17493 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17494
17495 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17496 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17497 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17498 return true;
17499 }
17500
17501 static bool
17502 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17503 {
17504 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17505 machine_mode vmode = d->vmode;
17506
17507 /* Make sure that the indices are constant. */
17508 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17509 for (unsigned int i = 0; i < encoded_nelts; ++i)
17510 if (!d->perm[i].is_constant ())
17511 return false;
17512
17513 if (d->testing_p)
17514 return true;
17515
17516 /* Generic code will try constant permutation twice. Once with the
17517 original mode and again with the elements lowered to QImode.
17518 So wait and don't do the selector expansion ourselves. */
17519 if (vmode != V8QImode && vmode != V16QImode)
17520 return false;
17521
17522 /* to_constant is safe since this routine is specific to Advanced SIMD
17523 vectors. */
17524 unsigned int nelt = d->perm.length ().to_constant ();
17525 for (unsigned int i = 0; i < nelt; ++i)
17526 /* If big-endian and two vectors, we end up with a weird mixed-endian
17527 mode on NEON. Reverse the index within each word but not the word
17528 itself. to_constant is safe because we checked is_constant above. */
17529 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17530 ? d->perm[i].to_constant () ^ (nelt - 1)
17531 : d->perm[i].to_constant ());
17532
17533 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17534 sel = force_reg (vmode, sel);
17535
17536 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17537 return true;
17538 }
17539
17540 /* Try to implement D using an SVE TBL instruction. */
17541
17542 static bool
17543 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17544 {
17545 unsigned HOST_WIDE_INT nelt;
17546
17547 /* Permuting two variable-length vectors could overflow the
17548 index range. */
17549 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17550 return false;
17551
17552 if (d->testing_p)
17553 return true;
17554
17555 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17556 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17557 if (d->one_vector_p)
17558 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17559 else
17560 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17561 return true;
17562 }
17563
17564 static bool
17565 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17566 {
17567 /* The pattern matching functions above are written to look for a small
17568 number to begin the sequence (0, 1, N/2). If we begin with an index
17569 from the second operand, we can swap the operands. */
17570 poly_int64 nelt = d->perm.length ();
17571 if (known_ge (d->perm[0], nelt))
17572 {
17573 d->perm.rotate_inputs (1);
17574 std::swap (d->op0, d->op1);
17575 }
17576
17577 if ((d->vec_flags == VEC_ADVSIMD
17578 || d->vec_flags == VEC_SVE_DATA
17579 || d->vec_flags == VEC_SVE_PRED)
17580 && known_gt (nelt, 1))
17581 {
17582 if (aarch64_evpc_rev_local (d))
17583 return true;
17584 else if (aarch64_evpc_rev_global (d))
17585 return true;
17586 else if (aarch64_evpc_ext (d))
17587 return true;
17588 else if (aarch64_evpc_dup (d))
17589 return true;
17590 else if (aarch64_evpc_zip (d))
17591 return true;
17592 else if (aarch64_evpc_uzp (d))
17593 return true;
17594 else if (aarch64_evpc_trn (d))
17595 return true;
17596 if (d->vec_flags == VEC_SVE_DATA)
17597 return aarch64_evpc_sve_tbl (d);
17598 else if (d->vec_flags == VEC_ADVSIMD)
17599 return aarch64_evpc_tbl (d);
17600 }
17601 return false;
17602 }
17603
17604 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17605
17606 static bool
17607 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17608 rtx op1, const vec_perm_indices &sel)
17609 {
17610 struct expand_vec_perm_d d;
17611
17612 /* Check whether the mask can be applied to a single vector. */
17613 if (sel.ninputs () == 1
17614 || (op0 && rtx_equal_p (op0, op1)))
17615 d.one_vector_p = true;
17616 else if (sel.all_from_input_p (0))
17617 {
17618 d.one_vector_p = true;
17619 op1 = op0;
17620 }
17621 else if (sel.all_from_input_p (1))
17622 {
17623 d.one_vector_p = true;
17624 op0 = op1;
17625 }
17626 else
17627 d.one_vector_p = false;
17628
17629 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17630 sel.nelts_per_input ());
17631 d.vmode = vmode;
17632 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17633 d.target = target;
17634 d.op0 = op0;
17635 d.op1 = op1;
17636 d.testing_p = !target;
17637
17638 if (!d.testing_p)
17639 return aarch64_expand_vec_perm_const_1 (&d);
17640
17641 rtx_insn *last = get_last_insn ();
17642 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17643 gcc_assert (last == get_last_insn ());
17644
17645 return ret;
17646 }
17647
17648 /* Generate a byte permute mask for a register of mode MODE,
17649 which has NUNITS units. */
17650
17651 rtx
17652 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17653 {
17654 /* We have to reverse each vector because we don't have
17655 a permuted load that can reverse-load according to ABI rules. */
17656 rtx mask;
17657 rtvec v = rtvec_alloc (16);
17658 unsigned int i, j;
17659 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17660
17661 gcc_assert (BYTES_BIG_ENDIAN);
17662 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17663
17664 for (i = 0; i < nunits; i++)
17665 for (j = 0; j < usize; j++)
17666 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17667 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17668 return force_reg (V16QImode, mask);
17669 }
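
/* For illustration (not exhaustive): with V4SImode (nunits == 4,
   usize == 4) the loop above builds the byte vector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. a TBL mask that byte-reverses each 32-bit element within the
   128-bit register.  */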
17670
17671 /* Return true if X is a valid second operand for the SVE instruction
17672 that implements integer comparison OP_CODE. */
17673
17674 static bool
17675 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17676 {
17677 if (register_operand (x, VOIDmode))
17678 return true;
17679
17680 switch (op_code)
17681 {
17682 case LTU:
17683 case LEU:
17684 case GEU:
17685 case GTU:
17686 return aarch64_sve_cmp_immediate_p (x, false);
17687 case LT:
17688 case LE:
17689 case GE:
17690 case GT:
17691 case NE:
17692 case EQ:
17693 return aarch64_sve_cmp_immediate_p (x, true);
17694 default:
17695 gcc_unreachable ();
17696 }
17697 }
17698
17699 /* Use predicated SVE instructions to implement the equivalent of:
17700
17701 (set TARGET OP)
17702
17703 given that PTRUE is an all-true predicate of the appropriate mode. */
17704
17705 static void
17706 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17707 {
17708 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17709 gen_rtvec (2, ptrue, op),
17710 UNSPEC_MERGE_PTRUE);
17711 rtx_insn *insn = emit_set_insn (target, unspec);
17712 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17713 }
17714
17715 /* Likewise, but also clobber the condition codes. */
17716
17717 static void
17718 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17719 {
17720 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17721 gen_rtvec (2, ptrue, op),
17722 UNSPEC_MERGE_PTRUE);
17723 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17724 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17725 }
17726
17727 /* Return the UNSPEC_COND_* code for comparison CODE. */
17728
17729 static unsigned int
17730 aarch64_unspec_cond_code (rtx_code code)
17731 {
17732 switch (code)
17733 {
17734 case NE:
17735 return UNSPEC_COND_FCMNE;
17736 case EQ:
17737 return UNSPEC_COND_FCMEQ;
17738 case LT:
17739 return UNSPEC_COND_FCMLT;
17740 case GT:
17741 return UNSPEC_COND_FCMGT;
17742 case LE:
17743 return UNSPEC_COND_FCMLE;
17744 case GE:
17745 return UNSPEC_COND_FCMGE;
17746 default:
17747 gcc_unreachable ();
17748 }
17749 }
17750
17751 /* Emit:
17752
17753 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17754
17755 where <X> is the operation associated with comparison CODE. This form
17756 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17757 semantics, such as when PRED might not be all-true and when comparing
17758 inactive lanes could have side effects. */
17759
17760 static void
17761 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17762 rtx pred, rtx op0, rtx op1)
17763 {
17764 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17765 gen_rtvec (3, pred, op0, op1),
17766 aarch64_unspec_cond_code (code));
17767 emit_set_insn (target, unspec);
17768 }
17769
17770 /* Expand an SVE integer comparison using the SVE equivalent of:
17771
17772 (set TARGET (CODE OP0 OP1)). */
17773
17774 void
17775 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17776 {
17777 machine_mode pred_mode = GET_MODE (target);
17778 machine_mode data_mode = GET_MODE (op0);
17779
17780 if (!aarch64_sve_cmp_operand_p (code, op1))
17781 op1 = force_reg (data_mode, op1);
17782
17783 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17784 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17785 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17786 }
17787
17788 /* Emit the SVE equivalent of:
17789
17790 (set TMP1 (CODE1 OP0 OP1))
17791 (set TMP2 (CODE2 OP0 OP1))
17792 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17793
17794 PTRUE is an all-true predicate with the same mode as TARGET. */
17795
17796 static void
17797 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17798 rtx ptrue, rtx op0, rtx op1)
17799 {
17800 machine_mode pred_mode = GET_MODE (ptrue);
17801 rtx tmp1 = gen_reg_rtx (pred_mode);
17802 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17803 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17804 rtx tmp2 = gen_reg_rtx (pred_mode);
17805 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17806 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17807 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17808 }
17809
17810 /* Emit the SVE equivalent of:
17811
17812 (set TMP (CODE OP0 OP1))
17813 (set TARGET (not TMP))
17814
17815 PTRUE is an all-true predicate with the same mode as TARGET. */
17816
17817 static void
17818 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17819 rtx op0, rtx op1)
17820 {
17821 machine_mode pred_mode = GET_MODE (ptrue);
17822 rtx tmp = gen_reg_rtx (pred_mode);
17823 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17824 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17825 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17826 }
17827
17828 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17829
17830 (set TARGET (CODE OP0 OP1))
17831
17832 If CAN_INVERT_P is true, the caller can also handle inverted results;
17833 return true if the result is in fact inverted. */
17834
17835 bool
17836 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17837 rtx op0, rtx op1, bool can_invert_p)
17838 {
17839 machine_mode pred_mode = GET_MODE (target);
17840 machine_mode data_mode = GET_MODE (op0);
17841
17842 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17843 switch (code)
17844 {
17845 case UNORDERED:
17846 /* UNORDERED has no immediate form. */
17847 op1 = force_reg (data_mode, op1);
17848 /* fall through */
17849 case LT:
17850 case LE:
17851 case GT:
17852 case GE:
17853 case EQ:
17854 case NE:
17855 {
17856 /* There is native support for the comparison. */
17857 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17858 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17859 return false;
17860 }
17861
17862 case LTGT:
17863 /* This is a trapping operation (LT or GT). */
17864 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17865 return false;
17866
17867 case UNEQ:
17868 if (!flag_trapping_math)
17869 {
17870 /* This would trap for signaling NaNs. */
17871 op1 = force_reg (data_mode, op1);
17872 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17873 return false;
17874 }
17875 /* fall through */
17876 case UNLT:
17877 case UNLE:
17878 case UNGT:
17879 case UNGE:
17880 if (flag_trapping_math)
17881 {
17882 /* Work out which elements are ordered. */
17883 rtx ordered = gen_reg_rtx (pred_mode);
17884 op1 = force_reg (data_mode, op1);
17885 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17886
17887 /* Test the opposite condition for the ordered elements,
17888 then invert the result. */
17889 if (code == UNEQ)
17890 code = NE;
17891 else
17892 code = reverse_condition_maybe_unordered (code);
17893 if (can_invert_p)
17894 {
17895 aarch64_emit_sve_predicated_cond (target, code,
17896 ordered, op0, op1);
17897 return true;
17898 }
17899 rtx tmp = gen_reg_rtx (pred_mode);
17900 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17901 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17902 return false;
17903 }
17904 break;
17905
17906 case ORDERED:
17907 /* ORDERED has no immediate form. */
17908 op1 = force_reg (data_mode, op1);
17909 break;
17910
17911 default:
17912 gcc_unreachable ();
17913 }
17914
17915 /* There is native support for the inverse comparison. */
17916 code = reverse_condition_maybe_unordered (code);
17917 if (can_invert_p)
17918 {
17919 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17920 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17921 return true;
17922 }
17923 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17924 return false;
17925 }
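
/* Worked example for the code above (illustrative): for UNGE with
   -ftrapping-math we first compute ORDERED = ~(OP0 unordered OP1) under
   the all-true predicate, then evaluate the reverse condition LT only on
   the ordered lanes.  If the caller accepts inverted results, that
   predicate is returned as-is (and we return true); otherwise it is
   inverted, giving the lanes that are either unordered or not less than,
   which is exactly UNGE.  */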
17926
17927 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17928 of the data being selected and CMP_MODE is the mode of the values being
17929 compared. */
17930
17931 void
17932 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17933 rtx *ops)
17934 {
17935 machine_mode pred_mode
17936 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17937 GET_MODE_SIZE (cmp_mode)).require ();
17938 rtx pred = gen_reg_rtx (pred_mode);
17939 if (FLOAT_MODE_P (cmp_mode))
17940 {
17941 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17942 ops[4], ops[5], true))
17943 std::swap (ops[1], ops[2]);
17944 }
17945 else
17946 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17947
17948 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17949 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17950 }
17951
17952 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17953 true. However, due to issues with register allocation it is preferable
17954 to avoid tying integer scalar and FP scalar modes. Executing integer
17955 operations in general registers is better than treating them as scalar
17956 vector operations. This reduces latency and avoids redundant int<->FP
17957 moves. So tie modes if they are either the same class, or vector modes
17958 with other vector modes, vector structs or any scalar mode. */
17959
17960 static bool
17961 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17962 {
17963 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17964 return true;
17965
17966 /* We specifically want to allow elements of "structure" modes to
17967 be tieable to the structure. This more general condition allows
17968 other rarer situations too. The reason we don't extend this to
17969 predicate modes is that there are no predicate structure modes
17970 nor any specific instructions for extracting part of a predicate
17971 register. */
17972 if (aarch64_vector_data_mode_p (mode1)
17973 && aarch64_vector_data_mode_p (mode2))
17974 return true;
17975
17976 /* Also allow any scalar modes with vectors. */
17977 if (aarch64_vector_mode_supported_p (mode1)
17978 || aarch64_vector_mode_supported_p (mode2))
17979 return true;
17980
17981 return false;
17982 }
17983
17984 /* Return a new RTX holding the result of moving POINTER forward by
17985 AMOUNT bytes. */
17986
17987 static rtx
17988 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17989 {
17990 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17991
17992 return adjust_automodify_address (pointer, GET_MODE (pointer),
17993 next, amount);
17994 }
17995
17996 /* Return a new RTX holding the result of moving POINTER forward by the
17997 size of the mode it points to. */
17998
17999 static rtx
18000 aarch64_progress_pointer (rtx pointer)
18001 {
18002 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18003 }
18004
18005 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18006 MODE bytes. */
18007
18008 static void
18009 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18010 machine_mode mode)
18011 {
18012 rtx reg = gen_reg_rtx (mode);
18013
18014 /* "Cast" the pointers to the correct mode. */
18015 *src = adjust_address (*src, mode, 0);
18016 *dst = adjust_address (*dst, mode, 0);
18017 /* Emit the memcpy. */
18018 emit_move_insn (reg, *src);
18019 emit_move_insn (*dst, reg);
18020 /* Move the pointers forward. */
18021 *src = aarch64_progress_pointer (*src);
18022 *dst = aarch64_progress_pointer (*dst);
18023 }
18024
18025 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18026 we succeed, otherwise return false. */
18027
18028 bool
18029 aarch64_expand_cpymem (rtx *operands)
18030 {
18031 int n, mode_bits;
18032 rtx dst = operands[0];
18033 rtx src = operands[1];
18034 rtx base;
18035 machine_mode cur_mode = BLKmode, next_mode;
18036 bool speed_p = !optimize_function_for_size_p (cfun);
18037
18038 /* When optimizing for size, give a better estimate of the length of a
18039 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18040 will always require an even number of instructions, and each
18041 operation requires both a load and a store, so divide the max number by 2. */
18042 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18043
18044 /* We can't do anything smart if the amount to copy is not constant. */
18045 if (!CONST_INT_P (operands[2]))
18046 return false;
18047
18048 n = INTVAL (operands[2]);
18049
18050 /* Try to keep the number of instructions low. For all cases we will do at
18051 most two moves for the residual amount, since we'll always overlap the
18052 remainder. */
18053 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18054 return false;
18055
18056 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18057 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18058
18059 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18060 src = adjust_automodify_address (src, VOIDmode, base, 0);
18061
18062 /* Convert n to bits to make the rest of the code simpler. */
18063 n = n * BITS_PER_UNIT;
18064
18065 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18066 larger than TImode, but we should not use them for loads/stores here. */
18067 const int copy_limit = GET_MODE_BITSIZE (TImode);
18068
18069 while (n > 0)
18070 {
18071 /* Find the largest mode in which to do the copy without over-reading
18072 or over-writing. */
18073 opt_scalar_int_mode mode_iter;
18074 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18075 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18076 cur_mode = mode_iter.require ();
18077
18078 gcc_assert (cur_mode != BLKmode);
18079
18080 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18081 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18082
18083 n -= mode_bits;
18084
18085 /* Do certain trailing copies as overlapping if it's going to be
18086 cheaper, i.e. if it takes fewer instructions. For instance, for a
18087 15-byte copy it's more efficient to do two overlapping 8-byte copies
18088 than 8 + 4 + 2 + 1. */
18089 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18090 {
18091 next_mode = smallest_mode_for_size (n, MODE_INT);
18092 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18093 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18094 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18095 n = n_bits;
18096 }
18097 }
18098
18099 return true;
18100 }
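
/* Worked example (illustrative): for a 15-byte copy, n starts at 120 bits
   and copy_limit is 128, so the first iteration picks DImode and copies
   8 bytes, leaving n == 56 bits.  smallest_mode_for_size then selects
   DImode again and the pointers are moved back by one byte, so the final
   8-byte copy overlaps the first: two load/store pairs in total instead
   of separate 8-, 4-, 2- and 1-byte copies.  */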
18101
18102 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18103 SImode stores. Handle the case when the constant has identical
18104 bottom and top halves. This is beneficial when the two stores can be
18105 merged into an STP and we avoid synthesising potentially expensive
18106 immediates twice. Return true if such a split is possible. */
18107
18108 bool
18109 aarch64_split_dimode_const_store (rtx dst, rtx src)
18110 {
18111 rtx lo = gen_lowpart (SImode, src);
18112 rtx hi = gen_highpart_mode (SImode, DImode, src);
18113
18114 bool size_p = optimize_function_for_size_p (cfun);
18115
18116 if (!rtx_equal_p (lo, hi))
18117 return false;
18118
18119 unsigned int orig_cost
18120 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18121 unsigned int lo_cost
18122 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18123
18124 /* We want to transform:
18125 MOV x1, 49370
18126 MOVK x1, 0x140, lsl 16
18127 MOVK x1, 0xc0da, lsl 32
18128 MOVK x1, 0x140, lsl 48
18129 STR x1, [x0]
18130 into:
18131 MOV w1, 49370
18132 MOVK w1, 0x140, lsl 16
18133 STP w1, w1, [x0]
18134 So we want to perform this only when we save two instructions
18135 or more. When optimizing for size, however, accept any code size
18136 savings we can. */
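/* As a worked illustration of the checks below, using the immediates in
   the comment above: orig_cost is 4 (MOV plus three MOVKs) and lo_cost
   is 2 (MOV plus one MOVK), so when optimizing for speed the split is
   accepted because 4 > lo_cost + 1; an orig_cost of 3 would be
   rejected.  */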
18137 if (size_p && orig_cost <= lo_cost)
18138 return false;
18139
18140 if (!size_p
18141 && (orig_cost <= lo_cost + 1))
18142 return false;
18143
18144 rtx mem_lo = adjust_address (dst, SImode, 0);
18145 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18146 return false;
18147
18148 rtx tmp_reg = gen_reg_rtx (SImode);
18149 aarch64_expand_mov_immediate (tmp_reg, lo);
18150 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18151 /* Don't emit an explicit store pair as this may not always be profitable.
18152 Let the sched-fusion logic decide whether to merge them. */
18153 emit_move_insn (mem_lo, tmp_reg);
18154 emit_move_insn (mem_hi, tmp_reg);
18155
18156 return true;
18157 }
18158
18159 /* Generate RTL for a conditional branch with rtx comparison CODE in
18160 mode CC_MODE. The destination of the unlikely conditional branch
18161 is LABEL_REF. */
18162
18163 void
18164 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18165 rtx label_ref)
18166 {
18167 rtx x;
18168 x = gen_rtx_fmt_ee (code, VOIDmode,
18169 gen_rtx_REG (cc_mode, CC_REGNUM),
18170 const0_rtx);
18171
18172 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18173 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18174 pc_rtx);
18175 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18176 }
18177
18178 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18179
18180 OP1 represents the TImode destination operand 1
18181 OP2 represents the TImode destination operand 2
18182 LOW_DEST represents the low half (DImode) of TImode operand 0
18183 LOW_IN1 represents the low half (DImode) of TImode operand 1
18184 LOW_IN2 represents the low half (DImode) of TImode operand 2
18185 HIGH_DEST represents the high half (DImode) of TImode operand 0
18186 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18187 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18188
18189 void
18190 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18191 rtx *low_in1, rtx *low_in2,
18192 rtx *high_dest, rtx *high_in1,
18193 rtx *high_in2)
18194 {
18195 *low_dest = gen_reg_rtx (DImode);
18196 *low_in1 = gen_lowpart (DImode, op1);
18197 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18198 subreg_lowpart_offset (DImode, TImode));
18199 *high_dest = gen_reg_rtx (DImode);
18200 *high_in1 = gen_highpart (DImode, op1);
18201 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18202 subreg_highpart_offset (DImode, TImode));
18203 }
18204
18205 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18206
18207 This function differs from 'aarch64_addti_scratch_regs' in that
18208 OP1 can be an immediate constant (zero). We must call
18209 subreg_highpart_offset with DImode and TImode arguments, otherwise
18210 VOIDmode will be used for the const_int which generates an internal
18211 error from subreg_size_highpart_offset which does not expect a size of zero.
18212
18213 OP1 represents the TImode destination operand 1
18214 OP2 represents the TImode destination operand 2
18215 LOW_DEST represents the low half (DImode) of TImode operand 0
18216 LOW_IN1 represents the low half (DImode) of TImode operand 1
18217 LOW_IN2 represents the low half (DImode) of TImode operand 2
18218 HIGH_DEST represents the high half (DImode) of TImode operand 0
18219 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18220 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18221
18222
18223 void
18224 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18225 rtx *low_in1, rtx *low_in2,
18226 rtx *high_dest, rtx *high_in1,
18227 rtx *high_in2)
18228 {
18229 *low_dest = gen_reg_rtx (DImode);
18230 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18231 subreg_lowpart_offset (DImode, TImode));
18232
18233 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18234 subreg_lowpart_offset (DImode, TImode));
18235 *high_dest = gen_reg_rtx (DImode);
18236
18237 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18238 subreg_highpart_offset (DImode, TImode));
18239 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18240 subreg_highpart_offset (DImode, TImode));
18241 }
18242
18243 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18244
18245 OP0 represents the TImode destination operand 0
18246 LOW_DEST represents the low half (DImode) of TImode operand 0
18247 LOW_IN1 represents the low half (DImode) of TImode operand 1
18248 LOW_IN2 represents the low half (DImode) of TImode operand 2
18249 HIGH_DEST represents the high half (DImode) of TImode operand 0
18250 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18251 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18252 UNSIGNED_P is true if the operation is being performed on unsigned
18253 values. */
18254 void
18255 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18256 rtx low_in2, rtx high_dest, rtx high_in1,
18257 rtx high_in2, bool unsigned_p)
18258 {
18259 if (low_in2 == const0_rtx)
18260 {
18261 low_dest = low_in1;
18262 high_in2 = force_reg (DImode, high_in2);
18263 if (unsigned_p)
18264 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18265 else
18266 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18267 }
18268 else
18269 {
18270 if (CONST_INT_P (low_in2))
18271 {
18272 high_in2 = force_reg (DImode, high_in2);
18273 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18274 GEN_INT (-INTVAL (low_in2))));
18275 }
18276 else
18277 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18278
18279 if (unsigned_p)
18280 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18281 else
18282 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18283 }
18284
18285 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18286 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18287
18288 }
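
/* Illustrative output for the general (register) case above: the
   expansion corresponds roughly to

       subs  xlo_dest, xlo_in1, xlo_in2
       sbcs  xhi_dest, xhi_in1, xhi_in2

   where the SBCS consumes the borrow produced by the SUBS and sets the
   flags that the caller's overflow check reads (V for signed, C for
   unsigned).  The register names are purely illustrative.  */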
18289
18290 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18291
18292 static unsigned HOST_WIDE_INT
18293 aarch64_asan_shadow_offset (void)
18294 {
18295 if (TARGET_ILP32)
18296 return (HOST_WIDE_INT_1 << 29);
18297 else
18298 return (HOST_WIDE_INT_1 << 36);
18299 }
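
/* For reference: libsanitizer maps an address A to its shadow byte at
   roughly (A >> 3) + offset, so the constants above are that offset,
   1<<29 for ILP32 and 1<<36 for LP64.  */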
18300
18301 static rtx
18302 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18303 int code, tree treeop0, tree treeop1)
18304 {
18305 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18306 rtx op0, op1;
18307 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18308 insn_code icode;
18309 struct expand_operand ops[4];
18310
18311 start_sequence ();
18312 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18313
18314 op_mode = GET_MODE (op0);
18315 if (op_mode == VOIDmode)
18316 op_mode = GET_MODE (op1);
18317
18318 switch (op_mode)
18319 {
18320 case E_QImode:
18321 case E_HImode:
18322 case E_SImode:
18323 cmp_mode = SImode;
18324 icode = CODE_FOR_cmpsi;
18325 break;
18326
18327 case E_DImode:
18328 cmp_mode = DImode;
18329 icode = CODE_FOR_cmpdi;
18330 break;
18331
18332 case E_SFmode:
18333 cmp_mode = SFmode;
18334 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18335 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18336 break;
18337
18338 case E_DFmode:
18339 cmp_mode = DFmode;
18340 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18341 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18342 break;
18343
18344 default:
18345 end_sequence ();
18346 return NULL_RTX;
18347 }
18348
18349 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18350 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18351 if (!op0 || !op1)
18352 {
18353 end_sequence ();
18354 return NULL_RTX;
18355 }
18356 *prep_seq = get_insns ();
18357 end_sequence ();
18358
18359 create_fixed_operand (&ops[0], op0);
18360 create_fixed_operand (&ops[1], op1);
18361
18362 start_sequence ();
18363 if (!maybe_expand_insn (icode, 2, ops))
18364 {
18365 end_sequence ();
18366 return NULL_RTX;
18367 }
18368 *gen_seq = get_insns ();
18369 end_sequence ();
18370
18371 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18372 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18373 }
18374
18375 static rtx
18376 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18377 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18378 {
18379 rtx op0, op1, target;
18380 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18381 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18382 insn_code icode;
18383 struct expand_operand ops[6];
18384 int aarch64_cond;
18385
18386 push_to_sequence (*prep_seq);
18387 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18388
18389 op_mode = GET_MODE (op0);
18390 if (op_mode == VOIDmode)
18391 op_mode = GET_MODE (op1);
18392
18393 switch (op_mode)
18394 {
18395 case E_QImode:
18396 case E_HImode:
18397 case E_SImode:
18398 cmp_mode = SImode;
18399 icode = CODE_FOR_ccmpsi;
18400 break;
18401
18402 case E_DImode:
18403 cmp_mode = DImode;
18404 icode = CODE_FOR_ccmpdi;
18405 break;
18406
18407 case E_SFmode:
18408 cmp_mode = SFmode;
18409 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18410 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18411 break;
18412
18413 case E_DFmode:
18414 cmp_mode = DFmode;
18415 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18416 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18417 break;
18418
18419 default:
18420 end_sequence ();
18421 return NULL_RTX;
18422 }
18423
18424 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18425 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18426 if (!op0 || !op1)
18427 {
18428 end_sequence ();
18429 return NULL_RTX;
18430 }
18431 *prep_seq = get_insns ();
18432 end_sequence ();
18433
18434 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18435 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18436
18437 if (bit_code != AND)
18438 {
18439 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18440 GET_MODE (XEXP (prev, 0))),
18441 VOIDmode, XEXP (prev, 0), const0_rtx);
18442 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18443 }
18444
18445 create_fixed_operand (&ops[0], XEXP (prev, 0));
18446 create_fixed_operand (&ops[1], target);
18447 create_fixed_operand (&ops[2], op0);
18448 create_fixed_operand (&ops[3], op1);
18449 create_fixed_operand (&ops[4], prev);
18450 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18451
18452 push_to_sequence (*gen_seq);
18453 if (!maybe_expand_insn (icode, 6, ops))
18454 {
18455 end_sequence ();
18456 return NULL_RTX;
18457 }
18458
18459 *gen_seq = get_insns ();
18460 end_sequence ();
18461
18462 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18463 }
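
/* For illustration, a chained comparison such as

       if (x == 3 && y > 5)

   is expanded through these two hooks into something like

       cmp   w0, 3
       ccmp  w1, 5, 4, eq
       b.gt  ...

   where the CCMP immediate (here 4, i.e. "Z set") supplies a flag state
   that makes the final condition fail whenever the first comparison is
   false.  Register numbers and the exact flag immediate are
   illustrative.  */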
18464
18465 #undef TARGET_GEN_CCMP_FIRST
18466 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18467
18468 #undef TARGET_GEN_CCMP_NEXT
18469 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18470
18471 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18472 instruction fusion of some sort. */
18473
18474 static bool
18475 aarch64_macro_fusion_p (void)
18476 {
18477 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18478 }
18479
18480
18481 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18482 should be kept together during scheduling. */
18483
18484 static bool
18485 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18486 {
18487 rtx set_dest;
18488 rtx prev_set = single_set (prev);
18489 rtx curr_set = single_set (curr);
18490 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18491 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18492
18493 if (!aarch64_macro_fusion_p ())
18494 return false;
18495
18496 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18497 {
18498 /* We are trying to match:
18499 prev (mov) == (set (reg r0) (const_int imm16))
18500 curr (movk) == (set (zero_extract (reg r0)
18501 (const_int 16)
18502 (const_int 16))
18503 (const_int imm16_1)) */
18504
18505 set_dest = SET_DEST (curr_set);
18506
18507 if (GET_CODE (set_dest) == ZERO_EXTRACT
18508 && CONST_INT_P (SET_SRC (curr_set))
18509 && CONST_INT_P (SET_SRC (prev_set))
18510 && CONST_INT_P (XEXP (set_dest, 2))
18511 && INTVAL (XEXP (set_dest, 2)) == 16
18512 && REG_P (XEXP (set_dest, 0))
18513 && REG_P (SET_DEST (prev_set))
18514 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18515 {
18516 return true;
18517 }
18518 }
18519
18520 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18521 {
18522
18523 /* We're trying to match:
18524 prev (adrp) == (set (reg r1)
18525 (high (symbol_ref ("SYM"))))
18526 curr (add) == (set (reg r0)
18527 (lo_sum (reg r1)
18528 (symbol_ref ("SYM"))))
18529 Note that r0 need not necessarily be the same as r1, especially
18530 during pre-regalloc scheduling. */
18531
18532 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18533 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18534 {
18535 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18536 && REG_P (XEXP (SET_SRC (curr_set), 0))
18537 && REGNO (XEXP (SET_SRC (curr_set), 0))
18538 == REGNO (SET_DEST (prev_set))
18539 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18540 XEXP (SET_SRC (curr_set), 1)))
18541 return true;
18542 }
18543 }
18544
18545 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18546 {
18547
18548 /* We're trying to match:
18549 prev (movk) == (set (zero_extract (reg r0)
18550 (const_int 16)
18551 (const_int 32))
18552 (const_int imm16_1))
18553 curr (movk) == (set (zero_extract (reg r0)
18554 (const_int 16)
18555 (const_int 48))
18556 (const_int imm16_2)) */
18557
18558 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18559 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18560 && REG_P (XEXP (SET_DEST (prev_set), 0))
18561 && REG_P (XEXP (SET_DEST (curr_set), 0))
18562 && REGNO (XEXP (SET_DEST (prev_set), 0))
18563 == REGNO (XEXP (SET_DEST (curr_set), 0))
18564 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18565 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18566 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18567 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18568 && CONST_INT_P (SET_SRC (prev_set))
18569 && CONST_INT_P (SET_SRC (curr_set)))
18570 return true;
18571
18572 }
18573 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18574 {
18575 /* We're trying to match:
18576 prev (adrp) == (set (reg r0)
18577 (high (symbol_ref ("SYM"))))
18578 curr (ldr) == (set (reg r1)
18579 (mem (lo_sum (reg r0)
18580 (symbol_ref ("SYM")))))
18581 or
18582 curr (ldr) == (set (reg r1)
18583 (zero_extend (mem
18584 (lo_sum (reg r0)
18585 (symbol_ref ("SYM")))))) */
18586 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18587 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18588 {
18589 rtx curr_src = SET_SRC (curr_set);
18590
18591 if (GET_CODE (curr_src) == ZERO_EXTEND)
18592 curr_src = XEXP (curr_src, 0);
18593
18594 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18595 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18596 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18597 == REGNO (SET_DEST (prev_set))
18598 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18599 XEXP (SET_SRC (prev_set), 0)))
18600 return true;
18601 }
18602 }
18603
18604 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18605 && any_condjump_p (curr))
18606 {
18607 unsigned int condreg1, condreg2;
18608 rtx cc_reg_1;
18609 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18610 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18611
18612 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18613 && prev
18614 && modified_in_p (cc_reg_1, prev))
18615 {
18616 enum attr_type prev_type = get_attr_type (prev);
18617
18618 /* FIXME: this misses some instructions that ThunderX considers simple
18619 arithmetic instructions; simple shifts are missed here. */
18620 if (prev_type == TYPE_ALUS_SREG
18621 || prev_type == TYPE_ALUS_IMM
18622 || prev_type == TYPE_LOGICS_REG
18623 || prev_type == TYPE_LOGICS_IMM)
18624 return true;
18625 }
18626 }
18627
18628 if (prev_set
18629 && curr_set
18630 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18631 && any_condjump_p (curr))
18632 {
18633 /* We're trying to match:
18634 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18635 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18636 (const_int 0))
18637 (label_ref ("SYM"))
18638 (pc)) */
18639 if (SET_DEST (curr_set) == (pc_rtx)
18640 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18641 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18642 && REG_P (SET_DEST (prev_set))
18643 && REGNO (SET_DEST (prev_set))
18644 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18645 {
18646 /* Fuse ALU operations followed by conditional branch instruction. */
18647 switch (get_attr_type (prev))
18648 {
18649 case TYPE_ALU_IMM:
18650 case TYPE_ALU_SREG:
18651 case TYPE_ADC_REG:
18652 case TYPE_ADC_IMM:
18653 case TYPE_ADCS_REG:
18654 case TYPE_ADCS_IMM:
18655 case TYPE_LOGIC_REG:
18656 case TYPE_LOGIC_IMM:
18657 case TYPE_CSEL:
18658 case TYPE_ADR:
18659 case TYPE_MOV_IMM:
18660 case TYPE_SHIFT_REG:
18661 case TYPE_SHIFT_IMM:
18662 case TYPE_BFM:
18663 case TYPE_RBIT:
18664 case TYPE_REV:
18665 case TYPE_EXTEND:
18666 return true;
18667
18668 default:;
18669 }
18670 }
18671 }
18672
18673 return false;
18674 }
18675
18676 /* Return true iff the instruction fusion described by OP is enabled. */
18677
18678 bool
18679 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18680 {
18681 return (aarch64_tune_params.fusible_ops & op) != 0;
18682 }
18683
18684 /* If MEM is in the form of [base+offset], extract the two parts
18685 of the address into BASE and OFFSET; otherwise return false
18686 after clearing BASE and OFFSET. */
18687
18688 bool
18689 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18690 {
18691 rtx addr;
18692
18693 gcc_assert (MEM_P (mem));
18694
18695 addr = XEXP (mem, 0);
18696
18697 if (REG_P (addr))
18698 {
18699 *base = addr;
18700 *offset = const0_rtx;
18701 return true;
18702 }
18703
18704 if (GET_CODE (addr) == PLUS
18705 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18706 {
18707 *base = XEXP (addr, 0);
18708 *offset = XEXP (addr, 1);
18709 return true;
18710 }
18711
18712 *base = NULL_RTX;
18713 *offset = NULL_RTX;
18714
18715 return false;
18716 }
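
/* For example, (mem (plus (reg x1) (const_int 16))) gives BASE == x1 and
   OFFSET == 16, whereas register-indexed or pre/post-modify addresses
   fail the match and clear both outputs.  */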
18717
18718 /* Types for scheduling fusion. */
18719 enum sched_fusion_type
18720 {
18721 SCHED_FUSION_NONE = 0,
18722 SCHED_FUSION_LD_SIGN_EXTEND,
18723 SCHED_FUSION_LD_ZERO_EXTEND,
18724 SCHED_FUSION_LD,
18725 SCHED_FUSION_ST,
18726 SCHED_FUSION_NUM
18727 };
18728
18729 /* If INSN is a load or store whose address has the form [base+offset],
18730 extract the two parts into BASE and OFFSET. Return the scheduling
18731 fusion type of this INSN. */
18732
18733 static enum sched_fusion_type
18734 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18735 {
18736 rtx x, dest, src;
18737 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18738
18739 gcc_assert (INSN_P (insn));
18740 x = PATTERN (insn);
18741 if (GET_CODE (x) != SET)
18742 return SCHED_FUSION_NONE;
18743
18744 src = SET_SRC (x);
18745 dest = SET_DEST (x);
18746
18747 machine_mode dest_mode = GET_MODE (dest);
18748
18749 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18750 return SCHED_FUSION_NONE;
18751
18752 if (GET_CODE (src) == SIGN_EXTEND)
18753 {
18754 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18755 src = XEXP (src, 0);
18756 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18757 return SCHED_FUSION_NONE;
18758 }
18759 else if (GET_CODE (src) == ZERO_EXTEND)
18760 {
18761 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18762 src = XEXP (src, 0);
18763 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18764 return SCHED_FUSION_NONE;
18765 }
18766
18767 if (GET_CODE (src) == MEM && REG_P (dest))
18768 extract_base_offset_in_addr (src, base, offset);
18769 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18770 {
18771 fusion = SCHED_FUSION_ST;
18772 extract_base_offset_in_addr (dest, base, offset);
18773 }
18774 else
18775 return SCHED_FUSION_NONE;
18776
18777 if (*base == NULL_RTX || *offset == NULL_RTX)
18778 fusion = SCHED_FUSION_NONE;
18779
18780 return fusion;
18781 }
18782
18783 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18784
18785 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18786 and PRI are only calculated for these instructions. For other instructions,
18787 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18788 types of instruction fusion can be added by returning different priorities.
18789
18790 It's important that irrelevant instructions get the largest FUSION_PRI. */
18791
18792 static void
18793 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18794 int *fusion_pri, int *pri)
18795 {
18796 int tmp, off_val;
18797 rtx base, offset;
18798 enum sched_fusion_type fusion;
18799
18800 gcc_assert (INSN_P (insn));
18801
18802 tmp = max_pri - 1;
18803 fusion = fusion_load_store (insn, &base, &offset);
18804 if (fusion == SCHED_FUSION_NONE)
18805 {
18806 *pri = tmp;
18807 *fusion_pri = tmp;
18808 return;
18809 }
18810
18811 /* Set FUSION_PRI according to fusion type and base register. */
18812 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18813
18814 /* Calculate PRI. */
18815 tmp /= 2;
18816
18817 /* INSN with smaller offset goes first. */
18818 off_val = (int)(INTVAL (offset));
18819 if (off_val >= 0)
18820 tmp -= (off_val & 0xfffff);
18821 else
18822 tmp += ((- off_val) & 0xfffff);
18823
18824 *pri = tmp;
18825 return;
18826 }
18827
18828 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18829 Adjust priority of sha1h instructions so they are scheduled before
18830 other SHA1 instructions. */
18831
18832 static int
18833 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18834 {
18835 rtx x = PATTERN (insn);
18836
18837 if (GET_CODE (x) == SET)
18838 {
18839 x = SET_SRC (x);
18840
18841 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18842 return priority + 10;
18843 }
18844
18845 return priority;
18846 }
18847
18848 /* Given OPERANDS of consecutive load/store, check if we can merge
18849 them into ldp/stp. LOAD is true if they are load instructions.
18850 MODE is the mode of memory operands. */
18851
18852 bool
18853 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18854 machine_mode mode)
18855 {
18856 HOST_WIDE_INT offval_1, offval_2, msize;
18857 enum reg_class rclass_1, rclass_2;
18858 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18859
18860 if (load)
18861 {
18862 mem_1 = operands[1];
18863 mem_2 = operands[3];
18864 reg_1 = operands[0];
18865 reg_2 = operands[2];
18866 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18867 if (REGNO (reg_1) == REGNO (reg_2))
18868 return false;
18869 }
18870 else
18871 {
18872 mem_1 = operands[0];
18873 mem_2 = operands[2];
18874 reg_1 = operands[1];
18875 reg_2 = operands[3];
18876 }
18877
18878 /* The mems cannot be volatile. */
18879 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18880 return false;
18881
18882 /* If we have SImode and slow unaligned ldp,
18883 check that the alignment is at least 8 bytes. */
18884 if (mode == SImode
18885 && (aarch64_tune_params.extra_tuning_flags
18886 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18887 && !optimize_size
18888 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18889 return false;
18890
18891 /* Check if the addresses are in the form of [base+offset]. */
18892 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18893 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18894 return false;
18895 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18896 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18897 return false;
18898
18899 /* Check if the bases are the same. */
18900 if (!rtx_equal_p (base_1, base_2))
18901 return false;
18902
18903 /* The operands must be of the same size. */
18904 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18905 GET_MODE_SIZE (GET_MODE (mem_2))));
18906
18907 offval_1 = INTVAL (offset_1);
18908 offval_2 = INTVAL (offset_2);
18909 /* We should only be trying this for fixed-sized modes. There is no
18910 SVE LDP/STP instruction. */
18911 msize = GET_MODE_SIZE (mode).to_constant ();
18912 /* Check if the offsets are consecutive. */
18913 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18914 return false;
18915
18916 /* Check if the addresses are clobbered by load. */
18917 if (load)
18918 {
18919 if (reg_mentioned_p (reg_1, mem_1))
18920 return false;
18921
18922 /* In increasing order, the last load can clobber the address. */
18923 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18924 return false;
18925 }
18926
18927 /* One of the memory accesses must be a mempair operand.
18928 If it is not the first one, they need to be swapped by the
18929 peephole. */
18930 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18931 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18932 return false;
18933
18934 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18935 rclass_1 = FP_REGS;
18936 else
18937 rclass_1 = GENERAL_REGS;
18938
18939 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18940 rclass_2 = FP_REGS;
18941 else
18942 rclass_2 = GENERAL_REGS;
18943
18944 /* Check if the registers are of the same class. */
18945 if (rclass_1 != rclass_2)
18946 return false;
18947
18948 return true;
18949 }
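
/* For example (register names illustrative), the pair

       ldr  x0, [x2, 8]
       ldr  x1, [x2, 16]

   passes these checks: same base, consecutive 8-byte offsets, neither
   destination mentioned in an address, and both registers in
   GENERAL_REGS, so it can become ldp x0, x1, [x2, 8].  */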
18950
18951 /* Given OPERANDS of consecutive load/store that can be merged,
18952 swap them if they are not in ascending order. */
18953 void
18954 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18955 {
18956 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18957 HOST_WIDE_INT offval_1, offval_2;
18958
18959 if (load)
18960 {
18961 mem_1 = operands[1];
18962 mem_2 = operands[3];
18963 }
18964 else
18965 {
18966 mem_1 = operands[0];
18967 mem_2 = operands[2];
18968 }
18969
18970 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18971 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18972
18973 offval_1 = INTVAL (offset_1);
18974 offval_2 = INTVAL (offset_2);
18975
18976 if (offval_1 > offval_2)
18977 {
18978 /* Irrespective of whether this is a load or a store,
18979 we do the same swap. */
18980 std::swap (operands[0], operands[2]);
18981 std::swap (operands[1], operands[3]);
18982 }
18983 }
18984
18985 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18986 comparison between the two. */
18987 int
18988 aarch64_host_wide_int_compare (const void *x, const void *y)
18989 {
18990 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18991 * ((const HOST_WIDE_INT *) y));
18992 }
18993
18994 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18995 other pointing to a REG rtx containing an offset, compare the offsets
18996 of the two pairs.
18997
18998 Return:
18999
19000 1 iff offset (X) > offset (Y)
19001 0 iff offset (X) == offset (Y)
19002 -1 iff offset (X) < offset (Y) */
19003 int
19004 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19005 {
19006 const rtx * operands_1 = (const rtx *) x;
19007 const rtx * operands_2 = (const rtx *) y;
19008 rtx mem_1, mem_2, base, offset_1, offset_2;
19009
19010 if (MEM_P (operands_1[0]))
19011 mem_1 = operands_1[0];
19012 else
19013 mem_1 = operands_1[1];
19014
19015 if (MEM_P (operands_2[0]))
19016 mem_2 = operands_2[0];
19017 else
19018 mem_2 = operands_2[1];
19019
19020 /* Extract the offsets. */
19021 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19022 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19023
19024 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19025
19026 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19027 }
19028
19029 /* Given OPERANDS of consecutive load/store, check if we can merge
19030 them into ldp/stp by adjusting the offset. LOAD is true if they
19031 are load instructions. MODE is the mode of memory operands.
19032
19033 Given below consecutive stores:
19034
19035 str w1, [xb, 0x100]
19036 str w1, [xb, 0x104]
19037 str w1, [xb, 0x108]
19038 str w1, [xb, 0x10c]
19039
19040 Though the offsets are out of the range supported by stp, we can
19041 still pair them after adjusting the offset, like:
19042
19043 add scratch, xb, 0x100
19044 stp w1, w1, [scratch]
19045 stp w1, w1, [scratch, 0x8]
19046
19047 The peephole patterns detecting this opportunity should guarantee
19048 the scratch register is available. */
19049
19050 bool
19051 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19052 scalar_mode mode)
19053 {
19054 const int num_insns = 4;
19055 enum reg_class rclass;
19056 HOST_WIDE_INT offvals[num_insns], msize;
19057 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19058
19059 if (load)
19060 {
19061 for (int i = 0; i < num_insns; i++)
19062 {
19063 reg[i] = operands[2 * i];
19064 mem[i] = operands[2 * i + 1];
19065
19066 gcc_assert (REG_P (reg[i]));
19067 }
19068
19069 /* Do not attempt to merge the loads if the loads clobber each other. */
19070 for (int i = 0; i < 8; i += 2)
19071 for (int j = i + 2; j < 8; j += 2)
19072 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19073 return false;
19074 }
19075 else
19076 for (int i = 0; i < num_insns; i++)
19077 {
19078 mem[i] = operands[2 * i];
19079 reg[i] = operands[2 * i + 1];
19080 }
19081
19082 /* Skip if memory operand is by itself valid for ldp/stp. */
19083 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19084 return false;
19085
19086 for (int i = 0; i < num_insns; i++)
19087 {
19088 /* The mems cannot be volatile. */
19089 if (MEM_VOLATILE_P (mem[i]))
19090 return false;
19091
19092 /* Check if the addresses are in the form of [base+offset]. */
19093 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19094 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19095 return false;
19096 }
19097
19098 /* Check if the registers are of the same class. */
19099 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19100 ? FP_REGS : GENERAL_REGS;
19101
19102 for (int i = 1; i < num_insns; i++)
19103 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19104 {
19105 if (rclass != FP_REGS)
19106 return false;
19107 }
19108 else
19109 {
19110 if (rclass != GENERAL_REGS)
19111 return false;
19112 }
19113
19114 /* Only the last register in the order in which they occur
19115 may be clobbered by the load. */
19116 if (rclass == GENERAL_REGS && load)
19117 for (int i = 0; i < num_insns - 1; i++)
19118 if (reg_mentioned_p (reg[i], mem[i]))
19119 return false;
19120
19121 /* Check if the bases are the same. */
19122 for (int i = 0; i < num_insns - 1; i++)
19123 if (!rtx_equal_p (base[i], base[i + 1]))
19124 return false;
19125
19126 for (int i = 0; i < num_insns; i++)
19127 offvals[i] = INTVAL (offset[i]);
19128
19129 msize = GET_MODE_SIZE (mode);
19130
19131 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19132 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19133 aarch64_host_wide_int_compare);
19134
19135 if (!(offvals[1] == offvals[0] + msize
19136 && offvals[3] == offvals[2] + msize))
19137 return false;
19138
19139 /* Check that offsets are within range of each other. The ldp/stp
19140 instructions have 7-bit immediate offsets, so use 0x80. */
19141 if (offvals[2] - offvals[0] >= msize * 0x80)
19142 return false;
19143
19144 /* The offsets must be aligned with respect to each other. */
19145 if (offvals[0] % msize != offvals[2] % msize)
19146 return false;
19147
19148 /* If we have SImode and slow unaligned ldp,
19149 check that the alignment is at least 8 bytes. */
19150 if (mode == SImode
19151 && (aarch64_tune_params.extra_tuning_flags
19152 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19153 && !optimize_size
19154 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19155 return false;
19156
19157 return true;
19158 }
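
/* Taking the four SImode stores from the comment above (offsets 0x100,
   0x104, 0x108, 0x10c; msize == 4): after sorting, offvals[1] ==
   offvals[0] + 4 and offvals[3] == offvals[2] + 4, the span
   offvals[2] - offvals[0] == 8 is well below msize * 0x80, and all
   offsets share the same alignment, so the group is accepted for the
   adjusted ldp/stp peephole.  */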
19159
19160 /* Given OPERANDS of consecutive load/store, this function pairs them
19161 into LDP/STP after adjusting the offset. It depends on the fact
19162 that the operands can be sorted so the offsets are correct for STP.
19163 MODE is the mode of memory operands. CODE is the rtl operator
19164 which should be applied to all memory operands; it is SIGN_EXTEND,
19165 ZERO_EXTEND or UNKNOWN. */
19166
19167 bool
19168 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19169 scalar_mode mode, RTX_CODE code)
19170 {
19171 rtx base, offset_1, offset_3, t1, t2;
19172 rtx mem_1, mem_2, mem_3, mem_4;
19173 rtx temp_operands[8];
19174 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19175 stp_off_upper_limit, stp_off_lower_limit, msize;
19176
19177 /* We make changes on a copy as we may still bail out. */
19178 for (int i = 0; i < 8; i ++)
19179 temp_operands[i] = operands[i];
19180
19181 /* Sort the operands. */
19182 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19183
19184 /* Copy the memory operands so that if we have to bail for some
19185 reason the original addresses are unchanged. */
19186 if (load)
19187 {
19188 mem_1 = copy_rtx (temp_operands[1]);
19189 mem_2 = copy_rtx (temp_operands[3]);
19190 mem_3 = copy_rtx (temp_operands[5]);
19191 mem_4 = copy_rtx (temp_operands[7]);
19192 }
19193 else
19194 {
19195 mem_1 = copy_rtx (temp_operands[0]);
19196 mem_2 = copy_rtx (temp_operands[2]);
19197 mem_3 = copy_rtx (temp_operands[4]);
19198 mem_4 = copy_rtx (temp_operands[6]);
19199 gcc_assert (code == UNKNOWN);
19200 }
19201
19202 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19203 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19204 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19205 && offset_3 != NULL_RTX);
19206
19207 /* Adjust offset so it can fit in LDP/STP instruction. */
19208 msize = GET_MODE_SIZE (mode);
19209 stp_off_upper_limit = msize * (0x40 - 1);
19210 stp_off_lower_limit = - msize * 0x40;
19211
19212 off_val_1 = INTVAL (offset_1);
19213 off_val_3 = INTVAL (offset_3);
19214
19215 /* The base offset is optimally half way between the two STP/LDP offsets. */
19216 if (msize <= 4)
19217 base_off = (off_val_1 + off_val_3) / 2;
19218 else
19219 /* However, due to issues with negative LDP/STP offset generation for
19220 larger modes (DF, DI and vector modes), we must not use negative
19221 addresses smaller than 9 signed unadjusted bits can store. This
19222 provides the most range in this case. */
19223 base_off = off_val_1;
19224
19225 /* Adjust the base so that it is aligned with the addresses but still
19226 optimal. */
19227 if (base_off % msize != off_val_1 % msize)
19228 /* Fix the offset, bearing in mind we want to make it bigger not
19229 smaller. */
19230 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19231 else if (msize <= 4)
19232 /* The negative range of LDP/STP is one larger than the positive range. */
19233 base_off += msize;
19234
19235 /* Check if base offset is too big or too small. We can attempt to resolve
19236 this issue by setting it to the maximum value and seeing if the offsets
19237 still fit. */
19238 if (base_off >= 0x1000)
19239 {
19240 base_off = 0x1000 - 1;
19241 /* We must still make sure that the base offset is aligned with respect
19242 to the address, but it may not be made any bigger. */
19243 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19244 }
19245
19246 /* Likewise for the case where the base is too small. */
19247 if (base_off <= -0x1000)
19248 {
19249 base_off = -0x1000 + 1;
19250 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19251 }
19252
19253 /* Offset of the first STP/LDP. */
19254 new_off_1 = off_val_1 - base_off;
19255
19256 /* Offset of the second STP/LDP. */
19257 new_off_3 = off_val_3 - base_off;
19258
19259 /* The offsets must be within the range of the LDP/STP instructions. */
19260 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19261 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19262 return false;
19263
19264 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19265 new_off_1), true);
19266 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19267 new_off_1 + msize), true);
19268 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19269 new_off_3), true);
19270 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19271 new_off_3 + msize), true);
19272
19273 if (!aarch64_mem_pair_operand (mem_1, mode)
19274 || !aarch64_mem_pair_operand (mem_3, mode))
19275 return false;
19276
19277 if (code == ZERO_EXTEND)
19278 {
19279 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19280 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19281 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19282 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19283 }
19284 else if (code == SIGN_EXTEND)
19285 {
19286 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19287 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19288 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19289 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19290 }
19291
19292 if (load)
19293 {
19294 operands[0] = temp_operands[0];
19295 operands[1] = mem_1;
19296 operands[2] = temp_operands[2];
19297 operands[3] = mem_2;
19298 operands[4] = temp_operands[4];
19299 operands[5] = mem_3;
19300 operands[6] = temp_operands[6];
19301 operands[7] = mem_4;
19302 }
19303 else
19304 {
19305 operands[0] = mem_1;
19306 operands[1] = temp_operands[1];
19307 operands[2] = mem_2;
19308 operands[3] = temp_operands[3];
19309 operands[4] = mem_3;
19310 operands[5] = temp_operands[5];
19311 operands[6] = mem_4;
19312 operands[7] = temp_operands[7];
19313 }
19314
19315 /* Emit adjusting instruction. */
19316 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19317 /* Emit ldp/stp instructions. */
19318 t1 = gen_rtx_SET (operands[0], operands[1]);
19319 t2 = gen_rtx_SET (operands[2], operands[3]);
19320 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19321 t1 = gen_rtx_SET (operands[4], operands[5]);
19322 t2 = gen_rtx_SET (operands[6], operands[7]);
19323 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19324 return true;
19325 }
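
/* Continuing the SImode example (off_val_1 == 0x100, off_val_3 == 0x108,
   msize == 4): base_off starts at the midpoint 0x104, is already
   suitably aligned, and is then bumped by msize to 0x108 to exploit the
   larger negative range, giving new_off_1 == -8 and new_off_3 == 0, both
   within the STP range [-0x100, 0xfc].  The emitted code is therefore
   roughly

       add  scratch, xb, 0x108
       stp  w1, w1, [scratch, -8]
       stp  w1, w1, [scratch]

   with scratch and xb as in the earlier comment.  */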
19326
19327 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19328 it isn't worth branching around empty masked ops (including masked
19329 stores). */
19330
19331 static bool
19332 aarch64_empty_mask_is_expensive (unsigned)
19333 {
19334 return false;
19335 }
19336
19337 /* Return true if a pseudo register should be created and used to hold
19338 the GOT address for PIC code. */
19339
19340 bool
19341 aarch64_use_pseudo_pic_reg (void)
19342 {
19343 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19344 }
19345
19346 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19347
19348 static int
19349 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19350 {
19351 switch (XINT (x, 1))
19352 {
19353 case UNSPEC_GOTSMALLPIC:
19354 case UNSPEC_GOTSMALLPIC28K:
19355 case UNSPEC_GOTTINYPIC:
19356 return 0;
19357 default:
19358 break;
19359 }
19360
19361 return default_unspec_may_trap_p (x, flags);
19362 }
19363
19364
19365 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19366 return the log2 of that value. Otherwise return -1. */
19367
19368 int
19369 aarch64_fpconst_pow_of_2 (rtx x)
19370 {
19371 const REAL_VALUE_TYPE *r;
19372
19373 if (!CONST_DOUBLE_P (x))
19374 return -1;
19375
19376 r = CONST_DOUBLE_REAL_VALUE (x);
19377
19378 if (REAL_VALUE_NEGATIVE (*r)
19379 || REAL_VALUE_ISNAN (*r)
19380 || REAL_VALUE_ISINF (*r)
19381 || !real_isinteger (r, DFmode))
19382 return -1;
19383
19384 return exact_log2 (real_to_integer (r));
19385 }
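
/* For example, (const_double 4.0) yields 2, while 3.0 (not a power of
   two), 0.5 (not an integer) and -4.0 (negative) all yield -1.  */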
19386
19387 /* If X is a vector of equal CONST_DOUBLE values and that value is
19388 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19389
19390 int
19391 aarch64_vec_fpconst_pow_of_2 (rtx x)
19392 {
19393 int nelts;
19394 if (GET_CODE (x) != CONST_VECTOR
19395 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19396 return -1;
19397
19398 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19399 return -1;
19400
19401 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19402 if (firstval <= 0)
19403 return -1;
19404
19405 for (int i = 1; i < nelts; i++)
19406 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19407 return -1;
19408
19409 return firstval;
19410 }
19411
19412 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19413 to float.
19414
19415 __fp16 always promotes through this hook.
19416 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19417 through the generic excess precision logic rather than here. */
19418
19419 static tree
19420 aarch64_promoted_type (const_tree t)
19421 {
19422 if (SCALAR_FLOAT_TYPE_P (t)
19423 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19424 return float_type_node;
19425
19426 return NULL_TREE;
19427 }
19428
19429 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19430
19431 static bool
19432 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19433 optimization_type opt_type)
19434 {
19435 switch (op)
19436 {
19437 case rsqrt_optab:
19438 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19439
19440 default:
19441 return true;
19442 }
19443 }
19444
19445 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19446
19447 static unsigned int
19448 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19449 int *offset)
19450 {
19451 /* Polynomial invariant 1 == (VG / 2) - 1. */
19452 gcc_assert (i == 1);
19453 *factor = 2;
19454 *offset = 1;
19455 return AARCH64_DWARF_VG;
19456 }
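/* Worked example, assuming the usual SVE DWARF encoding in which VG holds
   the vector length in 64-bit granules: for a 256-bit vector length VG is
   4, so indeterminate 1 evaluates to 4 / 2 - 1 == 1, and a poly_int size
   such as 16 + 16x bytes resolves to 32 bytes.  */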
19457
19458 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19459 if MODE is HFmode, and punt to the generic implementation otherwise. */
19460
19461 static bool
19462 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19463 {
19464 return (mode == HFmode
19465 ? true
19466 : default_libgcc_floating_mode_supported_p (mode));
19467 }
19468
19469 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19470 if MODE is HFmode, and punt to the generic implementation otherwise. */
19471
19472 static bool
19473 aarch64_scalar_mode_supported_p (scalar_mode mode)
19474 {
19475 return (mode == HFmode
19476 ? true
19477 : default_scalar_mode_supported_p (mode));
19478 }
19479
19480 /* Set the value of FLT_EVAL_METHOD.
19481 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19482
19483 0: evaluate all operations and constants, whose semantic type has at
19484 most the range and precision of type float, to the range and
19485 precision of float; evaluate all other operations and constants to
19486 the range and precision of the semantic type;
19487
19488 N, where _FloatN is a supported interchange floating type:
19489 evaluate all operations and constants, whose semantic type has at
19490 most the range and precision of _FloatN type, to the range and
19491 precision of the _FloatN type; evaluate all other operations and
19492 constants to the range and precision of the semantic type;
19493
19494 If we have the ARMv8.2-A extensions then we support _Float16 in native
19495 precision, so we should set this to 16. Otherwise, we support the type,
19496 but want to evaluate expressions in float precision, so set this to
19497 0. */
19498
19499 static enum flt_eval_method
19500 aarch64_excess_precision (enum excess_precision_type type)
19501 {
19502 switch (type)
19503 {
19504 case EXCESS_PRECISION_TYPE_FAST:
19505 case EXCESS_PRECISION_TYPE_STANDARD:
19506 /* We can calculate either in 16-bit range and precision or
19507 32-bit range and precision. Make that decision based on whether
19508 we have native support for the ARMv8.2-A 16-bit floating-point
19509 instructions or not. */
19510 return (TARGET_FP_F16INST
19511 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19512 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19513 case EXCESS_PRECISION_TYPE_IMPLICIT:
19514 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19515 default:
19516 gcc_unreachable ();
19517 }
19518 return FLT_EVAL_METHOD_UNPREDICTABLE;
19519 }
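/* A hypothetical example: for "_Float16 a, b; a + b", having the ARMv8.2-A
   FP16 instructions (TARGET_FP_F16INST) keeps the addition in half
   precision (FLT_EVAL_METHOD == 16); without them the operands are
   promoted and the addition is evaluated in float (FLT_EVAL_METHOD == 0).  */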
19520
19521 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19522 scheduled for speculative execution. Reject the long-running division
19523 and square-root instructions. */
19524
19525 static bool
19526 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19527 {
19528 switch (get_attr_type (insn))
19529 {
19530 case TYPE_SDIV:
19531 case TYPE_UDIV:
19532 case TYPE_FDIVS:
19533 case TYPE_FDIVD:
19534 case TYPE_FSQRTS:
19535 case TYPE_FSQRTD:
19536 case TYPE_NEON_FP_SQRT_S:
19537 case TYPE_NEON_FP_SQRT_D:
19538 case TYPE_NEON_FP_SQRT_S_Q:
19539 case TYPE_NEON_FP_SQRT_D_Q:
19540 case TYPE_NEON_FP_DIV_S:
19541 case TYPE_NEON_FP_DIV_D:
19542 case TYPE_NEON_FP_DIV_S_Q:
19543 case TYPE_NEON_FP_DIV_D_Q:
19544 return false;
19545 default:
19546 return true;
19547 }
19548 }
19549
19550 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19551
19552 static int
19553 aarch64_compute_pressure_classes (reg_class *classes)
19554 {
19555 int i = 0;
19556 classes[i++] = GENERAL_REGS;
19557 classes[i++] = FP_REGS;
19558 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19559 registers need to go in PR_LO_REGS at some point during their
19560 lifetime. Splitting it into two halves has the effect of making
19561 all predicates count against PR_LO_REGS, so that we try whenever
19562 possible to restrict the number of live predicates to 8. This
19563 greatly reduces the amount of spilling in certain loops. */
19564 classes[i++] = PR_LO_REGS;
19565 classes[i++] = PR_HI_REGS;
19566 return i;
19567 }
19568
19569 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19570
19571 static bool
19572 aarch64_can_change_mode_class (machine_mode from,
19573 machine_mode to, reg_class_t)
19574 {
19575 if (BYTES_BIG_ENDIAN)
19576 {
19577 bool from_sve_p = aarch64_sve_data_mode_p (from);
19578 bool to_sve_p = aarch64_sve_data_mode_p (to);
19579
19580 /* Don't allow changes between SVE data modes and non-SVE modes.
19581 See the comment at the head of aarch64-sve.md for details. */
19582 if (from_sve_p != to_sve_p)
19583 return false;
19584
19585 /* Don't allow changes in element size: lane 0 of the new vector
19586 would not then be lane 0 of the old vector. See the comment
19587 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19588 description.
19589
19590 In the worst case, this forces a register to be spilled in
19591 one mode and reloaded in the other, which handles the
19592 endianness correctly. */
19593 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19594 return false;
19595 }
19596 return true;
19597 }
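/* Illustrative big-endian examples: changing between the SVE modes VNx4SI
   and VNx8HI alters the element size (4 bytes vs 2 bytes) and so is
   rejected, forcing a spill/reload; changing between VNx4SI and VNx4SF
   keeps the 4-byte element size and is allowed.  */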
19598
19599 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
19600
19601 static void
19602 aarch64_select_early_remat_modes (sbitmap modes)
19603 {
19604 /* SVE values are not normally live across a call, so it should be
19605 worth doing early rematerialization even in VL-specific mode. */
19606 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19607 {
19608 machine_mode mode = (machine_mode) i;
19609 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19610 if (vec_flags & VEC_ANY_SVE)
19611 bitmap_set_bit (modes, i);
19612 }
19613 }
19614
19615 /* Override the default target speculation_safe_value. */
19616 static rtx
19617 aarch64_speculation_safe_value (machine_mode mode,
19618 rtx result, rtx val, rtx failval)
19619 {
19620 /* Maybe we should warn if falling back to hard barriers. They are
19621 likely to be noticeably more expensive than the alternative below. */
19622 if (!aarch64_track_speculation)
19623 return default_speculation_safe_value (mode, result, val, failval);
19624
19625 if (!REG_P (val))
19626 val = copy_to_mode_reg (mode, val);
19627
19628 if (!aarch64_reg_or_zero (failval, mode))
19629 failval = copy_to_mode_reg (mode, failval);
19630
19631 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19632 return result;
19633 }
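/* Sketch of the intended use (hedged): with -mtrack-speculation, a call to
   __builtin_speculation_safe_value reaches this hook and is expanded to
   the target's despeculate_copy pattern, which is keyed off the
   speculation-tracking state, instead of the full speculation barrier
   that default_speculation_safe_value would emit.  */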
19634
19635 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19636 Look into the tuning structure for an estimate.
19637 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19638 Advanced SIMD 128 bits. */
19639
19640 static HOST_WIDE_INT
19641 aarch64_estimated_poly_value (poly_int64 val)
19642 {
19643 enum aarch64_sve_vector_bits_enum width_source
19644 = aarch64_tune_params.sve_width;
19645
19646 /* If the tuning structure doesn't provide an estimate, use the default. */
19647 if (width_source == SVE_SCALABLE)
19648 return default_estimated_poly_value (val);
19649
19650 HOST_WIDE_INT over_128 = width_source - 128;
19651 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19652 }
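/* Worked example: with sve_width == SVE_256 (value 256), over_128 is 128,
   so a poly_int64 of 16 + 16x (coeffs {16, 16}) is estimated as
   16 + 16 * 128 / 128 == 32.  This relies on the non-SVE_SCALABLE
   enumerators having the vector width in bits as their value.  */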
19653
19654
19655 /* Return true for types that could be supported as SIMD return or
19656 argument types. */
19657
19658 static bool
19659 supported_simd_type (tree t)
19660 {
19661 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19662 {
19663 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19664 return s == 1 || s == 2 || s == 4 || s == 8;
19665 }
19666 return false;
19667 }
19668
19669 /* Return true for types that are currently supported as SIMD return
19670 or argument types. */
19671
19672 static bool
19673 currently_supported_simd_type (tree t, tree b)
19674 {
19675 if (COMPLEX_FLOAT_TYPE_P (t))
19676 return false;
19677
19678 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19679 return false;
19680
19681 return supported_simd_type (t);
19682 }
19683
19684 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19685
19686 static int
19687 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19688 struct cgraph_simd_clone *clonei,
19689 tree base_type, int num)
19690 {
19691 tree t, ret_type, arg_type;
19692 unsigned int elt_bits, vec_bits, count;
19693
19694 if (!TARGET_SIMD)
19695 return 0;
19696
19697 if (clonei->simdlen
19698 && (clonei->simdlen < 2
19699 || clonei->simdlen > 1024
19700 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19701 {
19702 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19703 "unsupported simdlen %d", clonei->simdlen);
19704 return 0;
19705 }
19706
19707 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19708 if (TREE_CODE (ret_type) != VOID_TYPE
19709 && !currently_supported_simd_type (ret_type, base_type))
19710 {
19711 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19712 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19713 "GCC does not currently support mixed size types "
19714 "for %<simd%> functions");
19715 else if (supported_simd_type (ret_type))
19716 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19717 "GCC does not currently support return type %qT "
19718 "for %<simd%> functions", ret_type);
19719 else
19720 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19721 "unsupported return type %qT for %<simd%> functions",
19722 ret_type);
19723 return 0;
19724 }
19725
19726 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19727 {
19728 arg_type = TREE_TYPE (t);
19729
19730 if (!currently_supported_simd_type (arg_type, base_type))
19731 {
19732 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19733 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19734 "GCC does not currently support mixed size types "
19735 "for %<simd%> functions");
19736 else
19737 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19738 "GCC does not currently support argument type %qT "
19739 "for %<simd%> functions", arg_type);
19740 return 0;
19741 }
19742 }
19743
19744 clonei->vecsize_mangle = 'n';
19745 clonei->mask_mode = VOIDmode;
19746 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19747 if (clonei->simdlen == 0)
19748 {
19749 count = 2;
19750 vec_bits = (num == 0 ? 64 : 128);
19751 clonei->simdlen = vec_bits / elt_bits;
19752 }
19753 else
19754 {
19755 count = 1;
19756 vec_bits = clonei->simdlen * elt_bits;
19757 if (vec_bits != 64 && vec_bits != 128)
19758 {
19759 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19760 "GCC does not currently support simdlen %d for type %qT",
19761 clonei->simdlen, base_type);
19762 return 0;
19763 }
19764 }
19765 clonei->vecsize_int = vec_bits;
19766 clonei->vecsize_float = vec_bits;
19767 return count;
19768 }
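/* Hedged example of the computation above: for a "declare simd" function
   whose base type is float (elt_bits == 32) and which has no explicit
   simdlen, two clones are produced: the num == 0 clone uses 64-bit
   vectors (simdlen 2) and the num == 1 clone uses 128-bit vectors
   (simdlen 4).  An explicit simdlen of 8 for a float base type would
   imply 256-bit vectors and is diagnosed and rejected above.  */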
19769
19770 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19771
19772 static void
19773 aarch64_simd_clone_adjust (struct cgraph_node *node)
19774 {
19775 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19776 use the correct ABI. */
19777
19778 tree t = TREE_TYPE (node->decl);
19779 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19780 TYPE_ATTRIBUTES (t));
19781 }
19782
19783 /* Implement TARGET_SIMD_CLONE_USABLE. */
19784
19785 static int
19786 aarch64_simd_clone_usable (struct cgraph_node *node)
19787 {
19788 switch (node->simdclone->vecsize_mangle)
19789 {
19790 case 'n':
19791 if (!TARGET_SIMD)
19792 return -1;
19793 return 0;
19794 default:
19795 gcc_unreachable ();
19796 }
19797 }
19798
19799 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
19800
19801 static int
19802 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19803 {
19804 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19805 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19806 return 0;
19807 return 1;
19808 }
19809
19810 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
19811
19812 static const char *
19813 aarch64_get_multilib_abi_name (void)
19814 {
19815 if (TARGET_BIG_END)
19816 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19817 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19818 }
19819
19820 /* Implement TARGET_STACK_PROTECT_GUARD. When the guard is a global
19821 variable, use the default implementation; otherwise return a null
19822 tree. */
19823 static tree
19824 aarch64_stack_protect_guard (void)
19825 {
19826 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19827 return default_stack_protect_guard ();
19828
19829 return NULL_TREE;
19830 }
19831
19832 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19833 section at the end if needed. */
19834 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19835 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19836 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19837 void
19838 aarch64_file_end_indicate_exec_stack ()
19839 {
19840 file_end_indicate_exec_stack ();
19841
19842 unsigned feature_1_and = 0;
19843 if (aarch64_bti_enabled ())
19844 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19845
19846 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19847 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19848
19849 if (feature_1_and)
19850 {
19851 /* Generate .note.gnu.property section. */
19852 switch_to_section (get_section (".note.gnu.property",
19853 SECTION_NOTYPE, NULL));
19854
19855 /* PT_NOTE header: namesz, descsz, type.
19856 namesz = 4 ("GNU\0")
19857 descsz = 16 (Size of the program property array)
19858 [(12 + padding) * Number of array elements]
19859 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19860 assemble_align (POINTER_SIZE);
19861 assemble_integer (GEN_INT (4), 4, 32, 1);
19862 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19863 assemble_integer (GEN_INT (5), 4, 32, 1);
19864
19865 /* PT_NOTE name. */
19866 assemble_string ("GNU", 4);
19867
19868 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19869 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19870 datasz = 4
19871 data = feature_1_and. */
19872 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19873 assemble_integer (GEN_INT (4), 4, 32, 1);
19874 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19875
19876 /* Pad the size of the note to the required alignment. */
19877 assemble_align (POINTER_SIZE);
19878 }
19879 }
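/* A sketch of the resulting note for LP64 (POINTER_BYTES == 8, so descsz
   rounds up to 16) with both BTI and PAC-RET enabled (feature_1_and == 3);
   the exact directives depend on the assembler output routines:

       .word   4                   namesz
       .word   16                  descsz
       .word   5                   NT_GNU_PROPERTY_TYPE_0
       .asciz  "GNU"
       .word   0xc0000000          GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word   4                   datasz
       .word   3                   BTI | PAC
       .p2align 3                  pad to 8-byte alignment  */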
19880 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19881 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19882 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
19883
19884 /* Target-specific selftests. */
19885
19886 #if CHECKING_P
19887
19888 namespace selftest {
19889
19890 /* Selftest for the RTL loader.
19891 Verify that the RTL loader copes with a dump from
19892 print_rtx_function. This is essentially just a test that class
19893 function_reader can handle a real dump, but it also verifies
19894 that lookup_reg_by_dump_name correctly handles hard regs.
19895 The presence of hard reg names in the dump means that the test is
19896 target-specific, hence it is in this file. */
19897
19898 static void
19899 aarch64_test_loading_full_dump ()
19900 {
19901 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19902
19903 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19904
19905 rtx_insn *insn_1 = get_insn_by_uid (1);
19906 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19907
19908 rtx_insn *insn_15 = get_insn_by_uid (15);
19909 ASSERT_EQ (INSN, GET_CODE (insn_15));
19910 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19911
19912 /* Verify crtl->return_rtx. */
19913 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19914 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19915 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19916 }
19917
19918 /* Run all target-specific selftests. */
19919
19920 static void
19921 aarch64_run_selftests (void)
19922 {
19923 aarch64_test_loading_full_dump ();
19924 }
19925
19926 } // namespace selftest
19927
19928 #endif /* #if CHECKING_P */
19929
19930 #undef TARGET_STACK_PROTECT_GUARD
19931 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19932
19933 #undef TARGET_ADDRESS_COST
19934 #define TARGET_ADDRESS_COST aarch64_address_cost
19935
19936 /* This hook determines whether unnamed bitfields affect the alignment
19937 of the containing structure. The hook returns true if the structure
19938 should inherit the alignment requirements of an unnamed bitfield's
19939 type. */
19940 #undef TARGET_ALIGN_ANON_BITFIELD
19941 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19942
19943 #undef TARGET_ASM_ALIGNED_DI_OP
19944 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19945
19946 #undef TARGET_ASM_ALIGNED_HI_OP
19947 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19948
19949 #undef TARGET_ASM_ALIGNED_SI_OP
19950 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19951
19952 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19953 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19954 hook_bool_const_tree_hwi_hwi_const_tree_true
19955
19956 #undef TARGET_ASM_FILE_START
19957 #define TARGET_ASM_FILE_START aarch64_start_file
19958
19959 #undef TARGET_ASM_OUTPUT_MI_THUNK
19960 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19961
19962 #undef TARGET_ASM_SELECT_RTX_SECTION
19963 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19964
19965 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19966 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19967
19968 #undef TARGET_BUILD_BUILTIN_VA_LIST
19969 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19970
19971 #undef TARGET_CALLEE_COPIES
19972 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19973
19974 #undef TARGET_CAN_ELIMINATE
19975 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19976
19977 #undef TARGET_CAN_INLINE_P
19978 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19979
19980 #undef TARGET_CANNOT_FORCE_CONST_MEM
19981 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19982
19983 #undef TARGET_CASE_VALUES_THRESHOLD
19984 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19985
19986 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19987 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19988
19989 /* Only the least significant bit is used for initialization guard
19990 variables. */
19991 #undef TARGET_CXX_GUARD_MASK_BIT
19992 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19993
19994 #undef TARGET_C_MODE_FOR_SUFFIX
19995 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19996
19997 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19998 #undef TARGET_DEFAULT_TARGET_FLAGS
19999 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20000 #endif
20001
20002 #undef TARGET_CLASS_MAX_NREGS
20003 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20004
20005 #undef TARGET_BUILTIN_DECL
20006 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20007
20008 #undef TARGET_BUILTIN_RECIPROCAL
20009 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20010
20011 #undef TARGET_C_EXCESS_PRECISION
20012 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20013
20014 #undef TARGET_EXPAND_BUILTIN
20015 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20016
20017 #undef TARGET_EXPAND_BUILTIN_VA_START
20018 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20019
20020 #undef TARGET_FOLD_BUILTIN
20021 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20022
20023 #undef TARGET_FUNCTION_ARG
20024 #define TARGET_FUNCTION_ARG aarch64_function_arg
20025
20026 #undef TARGET_FUNCTION_ARG_ADVANCE
20027 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20028
20029 #undef TARGET_FUNCTION_ARG_BOUNDARY
20030 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20031
20032 #undef TARGET_FUNCTION_ARG_PADDING
20033 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20034
20035 #undef TARGET_GET_RAW_RESULT_MODE
20036 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20037 #undef TARGET_GET_RAW_ARG_MODE
20038 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20039
20040 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20041 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20042
20043 #undef TARGET_FUNCTION_VALUE
20044 #define TARGET_FUNCTION_VALUE aarch64_function_value
20045
20046 #undef TARGET_FUNCTION_VALUE_REGNO_P
20047 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20048
20049 #undef TARGET_GIMPLE_FOLD_BUILTIN
20050 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20051
20052 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20053 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20054
20055 #undef TARGET_INIT_BUILTINS
20056 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20057
20058 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20059 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20060 aarch64_ira_change_pseudo_allocno_class
20061
20062 #undef TARGET_LEGITIMATE_ADDRESS_P
20063 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20064
20065 #undef TARGET_LEGITIMATE_CONSTANT_P
20066 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20067
20068 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20069 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20070 aarch64_legitimize_address_displacement
20071
20072 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20073 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20074
20075 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20076 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20077 aarch64_libgcc_floating_mode_supported_p
20078
20079 #undef TARGET_MANGLE_TYPE
20080 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20081
20082 #undef TARGET_MEMORY_MOVE_COST
20083 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20084
20085 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20086 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20087
20088 #undef TARGET_MUST_PASS_IN_STACK
20089 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20090
20091 /* This target hook should return true if accesses to volatile bitfields
20092 should use the narrowest mode possible. It should return false if these
20093 accesses should use the bitfield container type. */
20094 #undef TARGET_NARROW_VOLATILE_BITFIELD
20095 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20096
20097 #undef TARGET_OPTION_OVERRIDE
20098 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20099
20100 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20101 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20102 aarch64_override_options_after_change
20103
20104 #undef TARGET_OPTION_SAVE
20105 #define TARGET_OPTION_SAVE aarch64_option_save
20106
20107 #undef TARGET_OPTION_RESTORE
20108 #define TARGET_OPTION_RESTORE aarch64_option_restore
20109
20110 #undef TARGET_OPTION_PRINT
20111 #define TARGET_OPTION_PRINT aarch64_option_print
20112
20113 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20114 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20115
20116 #undef TARGET_SET_CURRENT_FUNCTION
20117 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20118
20119 #undef TARGET_PASS_BY_REFERENCE
20120 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20121
20122 #undef TARGET_PREFERRED_RELOAD_CLASS
20123 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20124
20125 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20126 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20127
20128 #undef TARGET_PROMOTED_TYPE
20129 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20130
20131 #undef TARGET_SECONDARY_RELOAD
20132 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20133
20134 #undef TARGET_SHIFT_TRUNCATION_MASK
20135 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20136
20137 #undef TARGET_SETUP_INCOMING_VARARGS
20138 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20139
20140 #undef TARGET_STRUCT_VALUE_RTX
20141 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20142
20143 #undef TARGET_REGISTER_MOVE_COST
20144 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20145
20146 #undef TARGET_RETURN_IN_MEMORY
20147 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20148
20149 #undef TARGET_RETURN_IN_MSB
20150 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20151
20152 #undef TARGET_RTX_COSTS
20153 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20154
20155 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20156 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20157
20158 #undef TARGET_SCHED_ISSUE_RATE
20159 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20160
20161 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20162 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20163 aarch64_sched_first_cycle_multipass_dfa_lookahead
20164
20165 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20166 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20167 aarch64_first_cycle_multipass_dfa_lookahead_guard
20168
20169 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20170 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20171 aarch64_get_separate_components
20172
20173 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20174 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20175 aarch64_components_for_bb
20176
20177 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20178 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20179 aarch64_disqualify_components
20180
20181 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20182 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20183 aarch64_emit_prologue_components
20184
20185 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20186 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20187 aarch64_emit_epilogue_components
20188
20189 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20190 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20191 aarch64_set_handled_components
20192
20193 #undef TARGET_TRAMPOLINE_INIT
20194 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20195
20196 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20197 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20198
20199 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20200 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20201
20202 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20203 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20204 aarch64_builtin_support_vector_misalignment
20205
20206 #undef TARGET_ARRAY_MODE
20207 #define TARGET_ARRAY_MODE aarch64_array_mode
20208
20209 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20210 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20211
20212 #undef TARGET_VECTORIZE_ADD_STMT_COST
20213 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20214
20215 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20216 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20217 aarch64_builtin_vectorization_cost
20218
20219 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20220 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20221
20222 #undef TARGET_VECTORIZE_BUILTINS
20223 #define TARGET_VECTORIZE_BUILTINS
20224
20225 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20226 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20227 aarch64_builtin_vectorized_function
20228
20229 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20230 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20231 aarch64_autovectorize_vector_sizes
20232
20233 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20234 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20235 aarch64_atomic_assign_expand_fenv
20236
20237 /* Section anchor support. */
20238
20239 #undef TARGET_MIN_ANCHOR_OFFSET
20240 #define TARGET_MIN_ANCHOR_OFFSET -256
20241
20242 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20243 byte offset; we can do much more for larger data types, but have no way
20244 to determine the size of the access. We assume accesses are aligned. */
20245 #undef TARGET_MAX_ANCHOR_OFFSET
20246 #define TARGET_MAX_ANCHOR_OFFSET 4095
20247
20248 #undef TARGET_VECTOR_ALIGNMENT
20249 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20250
20251 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20252 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20253 aarch64_vectorize_preferred_vector_alignment
20254 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20255 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20256 aarch64_simd_vector_alignment_reachable
20257
20258 /* vec_perm support. */
20259
20260 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20261 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20262 aarch64_vectorize_vec_perm_const
20263
20264 #undef TARGET_VECTORIZE_GET_MASK_MODE
20265 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20266 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20267 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20268 aarch64_empty_mask_is_expensive
20269 #undef TARGET_PREFERRED_ELSE_VALUE
20270 #define TARGET_PREFERRED_ELSE_VALUE \
20271 aarch64_preferred_else_value
20272
20273 #undef TARGET_INIT_LIBFUNCS
20274 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20275
20276 #undef TARGET_FIXED_CONDITION_CODE_REGS
20277 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20278
20279 #undef TARGET_FLAGS_REGNUM
20280 #define TARGET_FLAGS_REGNUM CC_REGNUM
20281
20282 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20283 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20284
20285 #undef TARGET_ASAN_SHADOW_OFFSET
20286 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20287
20288 #undef TARGET_LEGITIMIZE_ADDRESS
20289 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20290
20291 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20292 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20293
20294 #undef TARGET_CAN_USE_DOLOOP_P
20295 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20296
20297 #undef TARGET_SCHED_ADJUST_PRIORITY
20298 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20299
20300 #undef TARGET_SCHED_MACRO_FUSION_P
20301 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20302
20303 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20304 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20305
20306 #undef TARGET_SCHED_FUSION_PRIORITY
20307 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20308
20309 #undef TARGET_UNSPEC_MAY_TRAP_P
20310 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20311
20312 #undef TARGET_USE_PSEUDO_PIC_REG
20313 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20314
20315 #undef TARGET_PRINT_OPERAND
20316 #define TARGET_PRINT_OPERAND aarch64_print_operand
20317
20318 #undef TARGET_PRINT_OPERAND_ADDRESS
20319 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20320
20321 #undef TARGET_OPTAB_SUPPORTED_P
20322 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20323
20324 #undef TARGET_OMIT_STRUCT_RETURN_REG
20325 #define TARGET_OMIT_STRUCT_RETURN_REG true
20326
20327 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20328 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20329 aarch64_dwarf_poly_indeterminate_value
20330
20331 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20332 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20333 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20334
20335 #undef TARGET_HARD_REGNO_NREGS
20336 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20337 #undef TARGET_HARD_REGNO_MODE_OK
20338 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20339
20340 #undef TARGET_MODES_TIEABLE_P
20341 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20342
20343 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20344 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20345 aarch64_hard_regno_call_part_clobbered
20346
20347 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20348 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20349 aarch64_remove_extra_call_preserved_regs
20350
20351 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20352 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20353 aarch64_return_call_with_max_clobbers
20354
20355 #undef TARGET_CONSTANT_ALIGNMENT
20356 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20357
20358 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20359 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20360 aarch64_stack_clash_protection_alloca_probe_range
20361
20362 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20363 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20364
20365 #undef TARGET_CAN_CHANGE_MODE_CLASS
20366 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20367
20368 #undef TARGET_SELECT_EARLY_REMAT_MODES
20369 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20370
20371 #undef TARGET_SPECULATION_SAFE_VALUE
20372 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20373
20374 #undef TARGET_ESTIMATED_POLY_VALUE
20375 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20376
20377 #undef TARGET_ATTRIBUTE_TABLE
20378 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20379
20380 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20381 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20382 aarch64_simd_clone_compute_vecsize_and_simdlen
20383
20384 #undef TARGET_SIMD_CLONE_ADJUST
20385 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20386
20387 #undef TARGET_SIMD_CLONE_USABLE
20388 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20389
20390 #undef TARGET_COMP_TYPE_ATTRIBUTES
20391 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20392
20393 #undef TARGET_GET_MULTILIB_ABI_NAME
20394 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20395
20396 #if CHECKING_P
20397 #undef TARGET_RUN_TARGET_SELFTESTS
20398 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20399 #endif /* #if CHECKING_P */
20400
20401 #undef TARGET_ASM_POST_CFI_STARTPROC
20402 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20403
20404 struct gcc_target targetm = TARGET_INITIALIZER;
20405
20406 #include "gt-aarch64.h"