gcc/config/aarch64/aarch64.cc
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2023 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
84 #include "aarch64-feature-deps.h"
85
86 /* This file should be included last. */
87 #include "target-def.h"
88
89 /* Defined for convenience. */
90 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
91
92 /* Information about a legitimate vector immediate operand. */
93 struct simd_immediate_info
94 {
95 enum insn_type { MOV, MVN, INDEX, PTRUE };
96 enum modifier_type { LSL, MSL };
97
98 simd_immediate_info () {}
99 simd_immediate_info (scalar_float_mode, rtx);
100 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
101 insn_type = MOV, modifier_type = LSL,
102 unsigned int = 0);
103 simd_immediate_info (scalar_mode, rtx, rtx);
104 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
105
106 /* The mode of the elements. */
107 scalar_mode elt_mode;
108
109 /* The instruction to use to move the immediate into a vector. */
110 insn_type insn;
111
112 union
113 {
114 /* For MOV and MVN. */
115 struct
116 {
117 /* The value of each element. */
118 rtx value;
119
120 /* The kind of shift modifier to use, and the number of bits to shift.
121 This is (LSL, 0) if no shift is needed. */
122 modifier_type modifier;
123 unsigned int shift;
124 } mov;
125
126 /* For INDEX. */
127 struct
128 {
129 /* The value of the first element and the step to be added for each
130 subsequent element. */
131 rtx base, step;
132 } index;
133
134 /* For PTRUE. */
135 aarch64_svpattern pattern;
136 } u;
137 };
138
139 /* Construct a floating-point immediate in which each element has mode
140 ELT_MODE_IN and value VALUE_IN. */
141 inline simd_immediate_info
142 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
143 : elt_mode (elt_mode_in), insn (MOV)
144 {
145 u.mov.value = value_in;
146 u.mov.modifier = LSL;
147 u.mov.shift = 0;
148 }
149
150 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
151 and value VALUE_IN. The other parameters are as for the structure
152 fields. */
153 inline simd_immediate_info
154 ::simd_immediate_info (scalar_int_mode elt_mode_in,
155 unsigned HOST_WIDE_INT value_in,
156 insn_type insn_in, modifier_type modifier_in,
157 unsigned int shift_in)
158 : elt_mode (elt_mode_in), insn (insn_in)
159 {
160 u.mov.value = gen_int_mode (value_in, elt_mode_in);
161 u.mov.modifier = modifier_in;
162 u.mov.shift = shift_in;
163 }
164
165 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
166 and where element I is equal to BASE_IN + I * STEP_IN. */
167 inline simd_immediate_info
168 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
169 : elt_mode (elt_mode_in), insn (INDEX)
170 {
171 u.index.base = base_in;
172 u.index.step = step_in;
173 }
174
175 /* Construct a predicate that controls elements of mode ELT_MODE_IN
176 and has PTRUE pattern PATTERN_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_int_mode elt_mode_in,
179 aarch64_svpattern pattern_in)
180 : elt_mode (elt_mode_in), insn (PTRUE)
181 {
182 u.pattern = pattern_in;
183 }
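/* A minimal illustrative sketch (not used by the code below) of the kind of
   description the constructors above produce.  For example, the Advanced
   SIMD immediate "movi vN.4h, #0x25, lsl #8" -- a MOV of 0x25 into each
   16-bit element with an LSL #8 modifier -- could be described as:  */
inline simd_immediate_info
simd_immediate_info_example_movi ()
{
  /* HImode gives 16-bit elements, 0x25 is the 8-bit immediate, and the
     shift modifier is (LSL, 8).  */
  return simd_immediate_info (HImode, 0x25, simd_immediate_info::MOV,
			      simd_immediate_info::LSL, 8);
}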
184
185 namespace {
186
187 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
188 class pure_scalable_type_info
189 {
190 public:
191 /* Represents the result of analyzing a type. All values are nonzero,
192 in the possibly forlorn hope that accidental conversions to bool
193 trigger a warning. */
194 enum analysis_result
195 {
196 /* The type does not have an ABI identity; i.e. it doesn't contain
197 at least one object whose type is a Fundamental Data Type. */
198 NO_ABI_IDENTITY = 1,
199
200 /* The type is definitely a Pure Scalable Type. */
201 IS_PST,
202
203 /* The type is definitely not a Pure Scalable Type. */
204 ISNT_PST,
205
206 /* It doesn't matter for PCS purposes whether the type is a Pure
207 Scalable Type or not, since the type will be handled the same
208 way regardless.
209
210 Specifically, this means that if the type is a Pure Scalable Type,
211 there aren't enough argument registers to hold it, and so it will
212 need to be passed or returned in memory. If the type isn't a
213 Pure Scalable Type, it's too big to be passed or returned in core
214 or SIMD&FP registers, and so again will need to go in memory. */
215 DOESNT_MATTER
216 };
217
218 /* Aggregates of 17 bytes or more are normally passed and returned
219 in memory, so aggregates of that size can safely be analyzed as
220 DOESNT_MATTER. We need to be able to collect enough pieces to
221 represent a PST that is smaller than that. Since predicates are
222 2 bytes in size for -msve-vector-bits=128, that means we need to be
223 able to store at least 8 pieces.
224
225 We also need to be able to store enough pieces to represent
226 a single vector in each vector argument register and a single
227 predicate in each predicate argument register. This means that
228 we need at least 12 pieces. */
229 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
230 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
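  /* With NUM_FP_ARG_REGS == 8 (v0-v7) and NUM_PR_ARG_REGS == 4 (p0-p3),
     MAX_PIECES is 12.  That also covers the worst case described above:
     a 16-byte aggregate of 2-byte predicates at -msve-vector-bits=128
     needs 8 pieces.  */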
231
232 /* Describes one piece of a PST. Each piece is one of:
233
234 - a single Scalable Vector Type (SVT)
235 - a single Scalable Predicate Type (SPT)
236 - a PST containing 2, 3 or 4 SVTs, with no padding
237
238 It either represents a single built-in type or a PST formed from
239 multiple homogeneous built-in types. */
240 struct piece
241 {
242 rtx get_rtx (unsigned int, unsigned int) const;
243
244 /* The number of vector and predicate registers that the piece
245 occupies. One of the two is always zero. */
246 unsigned int num_zr;
247 unsigned int num_pr;
248
249 /* The mode of the registers described above. */
250 machine_mode mode;
251
252 /* If this piece is formed from multiple homogeneous built-in types,
253 this is the mode of the built-in types, otherwise it is MODE. */
254 machine_mode orig_mode;
255
256 /* The offset in bytes of the piece from the start of the type. */
257 poly_uint64_pod offset;
258 };
259
260 /* Divides types analyzed as IS_PST into individual pieces. The pieces
261 are in memory order. */
262 auto_vec<piece, MAX_PIECES> pieces;
263
264 unsigned int num_zr () const;
265 unsigned int num_pr () const;
266
267 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
268
269 analysis_result analyze (const_tree);
270 bool analyze_registers (const_tree);
271
272 private:
273 analysis_result analyze_array (const_tree);
274 analysis_result analyze_record (const_tree);
275 void add_piece (const piece &);
276 };
277 }
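/* For illustration (based on the AAPCS64 and the SVE ACLE types, not on any
   code in this file): the built-in tuple type svfloat32x2_t is a Pure
   Scalable Type made up of two Scalable Vector Types, so analyze () would
   classify it as IS_PST with num_zr () == 2 and num_pr () == 0, while a
   single svbool_t is a PST with num_zr () == 0 and num_pr () == 1.
   An aggregate that also contains a non-SVE member such as "int" is not
   a Pure Scalable Type and is analyzed as ISNT_PST.  */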
278
279 /* The current code model. */
280 enum aarch64_code_model aarch64_cmodel;
281
282 /* The number of 64-bit elements in an SVE vector. */
283 poly_uint16 aarch64_sve_vg;
284
285 #ifdef HAVE_AS_TLS
286 #undef TARGET_HAVE_TLS
287 #define TARGET_HAVE_TLS 1
288 #endif
289
290 static bool aarch64_composite_type_p (const_tree, machine_mode);
291 static bool aarch64_return_in_memory_1 (const_tree);
292 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
293 const_tree,
294 machine_mode *, int *,
295 bool *, bool);
296 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
297 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
298 static void aarch64_override_options_after_change (void);
299 static bool aarch64_vector_mode_supported_p (machine_mode);
300 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
301 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
302 const_tree type,
303 int misalignment,
304 bool is_packed);
305 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
306 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
307 aarch64_addr_query_type);
308
309 /* The processor for which instructions should be scheduled. */
310 enum aarch64_processor aarch64_tune = cortexa53;
311
312 /* Mask to specify which instruction scheduling options should be used. */
313 uint64_t aarch64_tune_flags = 0;
314
315 /* Global flag for PC relative loads. */
316 bool aarch64_pcrelative_literal_loads;
317
318 /* Global flag for whether frame pointer is enabled. */
319 bool aarch64_use_frame_pointer;
320
321 #define BRANCH_PROTECT_STR_MAX 255
322 char *accepted_branch_protection_string = NULL;
323
324 static enum aarch64_parse_opt_result
325 aarch64_parse_branch_protection (const char*, char**);
326
327 /* Support for command line parsing of boolean flags in the tuning
328 structures. */
329 struct aarch64_flag_desc
330 {
331 const char* name;
332 unsigned int flag;
333 };
334
335 #define AARCH64_FUSION_PAIR(name, internal_name) \
336 { name, AARCH64_FUSE_##internal_name },
337 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
338 {
339 { "none", AARCH64_FUSE_NOTHING },
340 #include "aarch64-fusion-pairs.def"
341 { "all", AARCH64_FUSE_ALL },
342 { NULL, AARCH64_FUSE_NOTHING }
343 };
344
345 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
346 { name, AARCH64_EXTRA_TUNE_##internal_name },
347 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
348 {
349 { "none", AARCH64_EXTRA_TUNE_NONE },
350 #include "aarch64-tuning-flags.def"
351 { "all", AARCH64_EXTRA_TUNE_ALL },
352 { NULL, AARCH64_EXTRA_TUNE_NONE }
353 };
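/* As an illustration of how the two tables above are populated (the entry
   names themselves live in the .def files): a line such as

     AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)

   in aarch64-fusion-pairs.def expands, via the #define above, to

     { "adrp+add", AARCH64_FUSE_ADRP_ADD },

   giving a name/flag pair that the -moverride parsing code can look up.  */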
354
355 /* Tuning parameters. */
356
357 static const struct cpu_addrcost_table generic_addrcost_table =
358 {
359 {
360 1, /* hi */
361 0, /* si */
362 0, /* di */
363 1, /* ti */
364 },
365 0, /* pre_modify */
366 0, /* post_modify */
367 0, /* post_modify_ld3_st3 */
368 0, /* post_modify_ld4_st4 */
369 0, /* register_offset */
370 0, /* register_sextend */
371 0, /* register_zextend */
372 0 /* imm_offset */
373 };
374
375 static const struct cpu_addrcost_table exynosm1_addrcost_table =
376 {
377 {
378 0, /* hi */
379 0, /* si */
380 0, /* di */
381 2, /* ti */
382 },
383 0, /* pre_modify */
384 0, /* post_modify */
385 0, /* post_modify_ld3_st3 */
386 0, /* post_modify_ld4_st4 */
387 1, /* register_offset */
388 1, /* register_sextend */
389 2, /* register_zextend */
390 0, /* imm_offset */
391 };
392
393 static const struct cpu_addrcost_table xgene1_addrcost_table =
394 {
395 {
396 1, /* hi */
397 0, /* si */
398 0, /* di */
399 1, /* ti */
400 },
401 1, /* pre_modify */
402 1, /* post_modify */
403 1, /* post_modify_ld3_st3 */
404 1, /* post_modify_ld4_st4 */
405 0, /* register_offset */
406 1, /* register_sextend */
407 1, /* register_zextend */
408 0, /* imm_offset */
409 };
410
411 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
412 {
413 {
414 1, /* hi */
415 1, /* si */
416 1, /* di */
417 2, /* ti */
418 },
419 0, /* pre_modify */
420 0, /* post_modify */
421 0, /* post_modify_ld3_st3 */
422 0, /* post_modify_ld4_st4 */
423 2, /* register_offset */
424 3, /* register_sextend */
425 3, /* register_zextend */
426 0, /* imm_offset */
427 };
428
429 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
430 {
431 {
432 1, /* hi */
433 1, /* si */
434 1, /* di */
435 2, /* ti */
436 },
437 0, /* pre_modify */
438 0, /* post_modify */
439 0, /* post_modify_ld3_st3 */
440 0, /* post_modify_ld4_st4 */
441 2, /* register_offset */
442 3, /* register_sextend */
443 3, /* register_zextend */
444 0, /* imm_offset */
445 };
446
447 static const struct cpu_addrcost_table tsv110_addrcost_table =
448 {
449 {
450 1, /* hi */
451 0, /* si */
452 0, /* di */
453 1, /* ti */
454 },
455 0, /* pre_modify */
456 0, /* post_modify */
457 0, /* post_modify_ld3_st3 */
458 0, /* post_modify_ld4_st4 */
459 0, /* register_offset */
460 1, /* register_sextend */
461 1, /* register_zextend */
462 0, /* imm_offset */
463 };
464
465 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
466 {
467 {
468 1, /* hi */
469 1, /* si */
470 1, /* di */
471 2, /* ti */
472 },
473 1, /* pre_modify */
474 1, /* post_modify */
475 1, /* post_modify_ld3_st3 */
476 1, /* post_modify_ld4_st4 */
477 3, /* register_offset */
478 3, /* register_sextend */
479 3, /* register_zextend */
480 2, /* imm_offset */
481 };
482
483 static const struct cpu_addrcost_table a64fx_addrcost_table =
484 {
485 {
486 1, /* hi */
487 1, /* si */
488 1, /* di */
489 2, /* ti */
490 },
491 0, /* pre_modify */
492 0, /* post_modify */
493 0, /* post_modify_ld3_st3 */
494 0, /* post_modify_ld4_st4 */
495 2, /* register_offset */
496 3, /* register_sextend */
497 3, /* register_zextend */
498 0, /* imm_offset */
499 };
500
501 static const struct cpu_addrcost_table neoversev1_addrcost_table =
502 {
503 {
504 1, /* hi */
505 0, /* si */
506 0, /* di */
507 1, /* ti */
508 },
509 0, /* pre_modify */
510 0, /* post_modify */
511 3, /* post_modify_ld3_st3 */
512 3, /* post_modify_ld4_st4 */
513 0, /* register_offset */
514 0, /* register_sextend */
515 0, /* register_zextend */
516 0 /* imm_offset */
517 };
518
519 static const struct cpu_addrcost_table neoversen2_addrcost_table =
520 {
521 {
522 1, /* hi */
523 0, /* si */
524 0, /* di */
525 1, /* ti */
526 },
527 0, /* pre_modify */
528 0, /* post_modify */
529 2, /* post_modify_ld3_st3 */
530 2, /* post_modify_ld4_st4 */
531 0, /* register_offset */
532 0, /* register_sextend */
533 0, /* register_zextend */
534 0 /* imm_offset */
535 };
536
537 static const struct cpu_addrcost_table neoversev2_addrcost_table =
538 {
539 {
540 1, /* hi */
541 0, /* si */
542 0, /* di */
543 1, /* ti */
544 },
545 0, /* pre_modify */
546 0, /* post_modify */
547 2, /* post_modify_ld3_st3 */
548 2, /* post_modify_ld4_st4 */
549 0, /* register_offset */
550 0, /* register_sextend */
551 0, /* register_zextend */
552 0 /* imm_offset */
553 };
554
555 static const struct cpu_regmove_cost generic_regmove_cost =
556 {
557 1, /* GP2GP */
558 /* Avoid the use of slow int<->fp moves for spilling by setting
559 their cost higher than memmov_cost. */
560 5, /* GP2FP */
561 5, /* FP2GP */
562 2 /* FP2FP */
563 };
564
565 static const struct cpu_regmove_cost cortexa57_regmove_cost =
566 {
567 1, /* GP2GP */
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
570 5, /* GP2FP */
571 5, /* FP2GP */
572 2 /* FP2FP */
573 };
574
575 static const struct cpu_regmove_cost cortexa53_regmove_cost =
576 {
577 1, /* GP2GP */
578 /* Avoid the use of slow int<->fp moves for spilling by setting
579 their cost higher than memmov_cost. */
580 5, /* GP2FP */
581 5, /* FP2GP */
582 2 /* FP2FP */
583 };
584
585 static const struct cpu_regmove_cost exynosm1_regmove_cost =
586 {
587 1, /* GP2GP */
588 /* Avoid the use of slow int<->fp moves for spilling by setting
589 their cost higher than memmov_cost (the actual costs are 4 and 9). */
590 9, /* GP2FP */
591 9, /* FP2GP */
592 1 /* FP2FP */
593 };
594
595 static const struct cpu_regmove_cost thunderx_regmove_cost =
596 {
597 2, /* GP2GP */
598 2, /* GP2FP */
599 6, /* FP2GP */
600 4 /* FP2FP */
601 };
602
603 static const struct cpu_regmove_cost xgene1_regmove_cost =
604 {
605 1, /* GP2GP */
606 /* Avoid the use of slow int<->fp moves for spilling by setting
607 their cost higher than memmov_cost. */
608 8, /* GP2FP */
609 8, /* FP2GP */
610 2 /* FP2FP */
611 };
612
613 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
614 {
615 2, /* GP2GP */
616 /* Avoid the use of int<->fp moves for spilling. */
617 6, /* GP2FP */
618 6, /* FP2GP */
619 4 /* FP2FP */
620 };
621
622 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
623 {
624 1, /* GP2GP */
625 /* Avoid the use of int<->fp moves for spilling. */
626 5, /* GP2FP */
627 6, /* FP2GP */
628 3, /* FP2FP */
629 };
630
631 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
632 {
633 1, /* GP2GP */
634 /* Avoid the use of int<->fp moves for spilling. */
635 4, /* GP2FP */
636 5, /* FP2GP */
637 4 /* FP2FP */
638 };
639
640 static const struct cpu_regmove_cost tsv110_regmove_cost =
641 {
642 1, /* GP2GP */
643 /* Avoid the use of slow int<->fp moves for spilling by setting
644 their cost higher than memmov_cost. */
645 2, /* GP2FP */
646 3, /* FP2GP */
647 2 /* FP2FP */
648 };
649
650 static const struct cpu_regmove_cost a64fx_regmove_cost =
651 {
652 1, /* GP2GP */
653 /* Avoid the use of slow int<->fp moves for spilling by setting
654 their cost higher than memmov_cost. */
655 5, /* GP2FP */
656 7, /* FP2GP */
657 2 /* FP2FP */
658 };
659
660 static const struct cpu_regmove_cost neoversen2_regmove_cost =
661 {
662 1, /* GP2GP */
663 /* Spilling via int<->fp moves instead of to memory is recommended, so set
664 realistic costs compared to memmov_cost. */
665 3, /* GP2FP */
666 2, /* FP2GP */
667 2 /* FP2FP */
668 };
669
670 static const struct cpu_regmove_cost neoversev1_regmove_cost =
671 {
672 1, /* GP2GP */
673 /* Spilling via int<->fp moves instead of to memory is recommended, so set
674 realistic costs compared to memmov_cost. */
675 3, /* GP2FP */
676 2, /* FP2GP */
677 2 /* FP2FP */
678 };
679
680 static const struct cpu_regmove_cost neoversev2_regmove_cost =
681 {
682 1, /* GP2GP */
683 /* Spilling via int<->fp moves instead of to memory is recommended, so set
684 realistic costs compared to memmov_cost. */
685 3, /* GP2FP */
686 2, /* FP2GP */
687 2 /* FP2FP */
688 };
689
690 /* Generic costs for Advanced SIMD vector operations. */
691 static const advsimd_vec_cost generic_advsimd_vector_cost =
692 {
693 1, /* int_stmt_cost */
694 1, /* fp_stmt_cost */
695 0, /* ld2_st2_permute_cost */
696 0, /* ld3_st3_permute_cost */
697 0, /* ld4_st4_permute_cost */
698 2, /* permute_cost */
699 2, /* reduc_i8_cost */
700 2, /* reduc_i16_cost */
701 2, /* reduc_i32_cost */
702 2, /* reduc_i64_cost */
703 2, /* reduc_f16_cost */
704 2, /* reduc_f32_cost */
705 2, /* reduc_f64_cost */
706 2, /* store_elt_extra_cost */
707 2, /* vec_to_scalar_cost */
708 1, /* scalar_to_vec_cost */
709 1, /* align_load_cost */
710 1, /* unalign_load_cost */
711 1, /* unalign_store_cost */
712 1 /* store_cost */
713 };
714
715 /* Generic costs for SVE vector operations. */
716 static const sve_vec_cost generic_sve_vector_cost =
717 {
718 {
719 1, /* int_stmt_cost */
720 1, /* fp_stmt_cost */
721 0, /* ld2_st2_permute_cost */
722 0, /* ld3_st3_permute_cost */
723 0, /* ld4_st4_permute_cost */
724 2, /* permute_cost */
725 2, /* reduc_i8_cost */
726 2, /* reduc_i16_cost */
727 2, /* reduc_i32_cost */
728 2, /* reduc_i64_cost */
729 2, /* reduc_f16_cost */
730 2, /* reduc_f32_cost */
731 2, /* reduc_f64_cost */
732 2, /* store_elt_extra_cost */
733 2, /* vec_to_scalar_cost */
734 1, /* scalar_to_vec_cost */
735 1, /* align_load_cost */
736 1, /* unalign_load_cost */
737 1, /* unalign_store_cost */
738 1 /* store_cost */
739 },
740 2, /* clast_cost */
741 2, /* fadda_f16_cost */
742 2, /* fadda_f32_cost */
743 2, /* fadda_f64_cost */
744 4, /* gather_load_x32_cost */
745 2, /* gather_load_x64_cost */
746 1 /* scatter_store_elt_cost */
747 };
748
749 /* Generic costs for vector insn classes. */
750 static const struct cpu_vector_cost generic_vector_cost =
751 {
752 1, /* scalar_int_stmt_cost */
753 1, /* scalar_fp_stmt_cost */
754 1, /* scalar_load_cost */
755 1, /* scalar_store_cost */
756 3, /* cond_taken_branch_cost */
757 1, /* cond_not_taken_branch_cost */
758 &generic_advsimd_vector_cost, /* advsimd */
759 &generic_sve_vector_cost, /* sve */
760 nullptr /* issue_info */
761 };
762
763 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
764 {
765 2, /* int_stmt_cost */
766 5, /* fp_stmt_cost */
767 0, /* ld2_st2_permute_cost */
768 0, /* ld3_st3_permute_cost */
769 0, /* ld4_st4_permute_cost */
770 3, /* permute_cost */
771 13, /* reduc_i8_cost */
772 13, /* reduc_i16_cost */
773 13, /* reduc_i32_cost */
774 13, /* reduc_i64_cost */
775 13, /* reduc_f16_cost */
776 13, /* reduc_f32_cost */
777 13, /* reduc_f64_cost */
778 13, /* store_elt_extra_cost */
779 13, /* vec_to_scalar_cost */
780 4, /* scalar_to_vec_cost */
781 6, /* align_load_cost */
782 6, /* unalign_load_cost */
783 1, /* unalign_store_cost */
784 1 /* store_cost */
785 };
786
787 static const sve_vec_cost a64fx_sve_vector_cost =
788 {
789 {
790 2, /* int_stmt_cost */
791 5, /* fp_stmt_cost */
792 0, /* ld2_st2_permute_cost */
793 0, /* ld3_st3_permute_cost */
794 0, /* ld4_st4_permute_cost */
795 3, /* permute_cost */
796 13, /* reduc_i8_cost */
797 13, /* reduc_i16_cost */
798 13, /* reduc_i32_cost */
799 13, /* reduc_i64_cost */
800 13, /* reduc_f16_cost */
801 13, /* reduc_f32_cost */
802 13, /* reduc_f64_cost */
803 13, /* store_elt_extra_cost */
804 13, /* vec_to_scalar_cost */
805 4, /* scalar_to_vec_cost */
806 6, /* align_load_cost */
807 6, /* unalign_load_cost */
808 1, /* unalign_store_cost */
809 1 /* store_cost */
810 },
811 13, /* clast_cost */
812 13, /* fadda_f16_cost */
813 13, /* fadda_f32_cost */
814 13, /* fadda_f64_cost */
815 64, /* gather_load_x32_cost */
816 32, /* gather_load_x64_cost */
817 1 /* scatter_store_elt_cost */
818 };
819
820 static const struct cpu_vector_cost a64fx_vector_cost =
821 {
822 1, /* scalar_int_stmt_cost */
823 5, /* scalar_fp_stmt_cost */
824 4, /* scalar_load_cost */
825 1, /* scalar_store_cost */
826 3, /* cond_taken_branch_cost */
827 1, /* cond_not_taken_branch_cost */
828 &a64fx_advsimd_vector_cost, /* advsimd */
829 &a64fx_sve_vector_cost, /* sve */
830 nullptr /* issue_info */
831 };
832
833 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
834 {
835 1, /* int_stmt_cost */
836 3, /* fp_stmt_cost */
837 0, /* ld2_st2_permute_cost */
838 0, /* ld3_st3_permute_cost */
839 0, /* ld4_st4_permute_cost */
840 2, /* permute_cost */
841 1, /* reduc_i8_cost */
842 1, /* reduc_i16_cost */
843 1, /* reduc_i32_cost */
844 1, /* reduc_i64_cost */
845 1, /* reduc_f16_cost */
846 1, /* reduc_f32_cost */
847 1, /* reduc_f64_cost */
848 1, /* store_elt_extra_cost */
849 1, /* vec_to_scalar_cost */
850 1, /* scalar_to_vec_cost */
851 1, /* align_load_cost */
852 1, /* unalign_load_cost */
853 1, /* unalign_store_cost */
854 1 /* store_cost */
855 };
856
857 /* QDF24XX costs for vector insn classes. */
858 static const struct cpu_vector_cost qdf24xx_vector_cost =
859 {
860 1, /* scalar_int_stmt_cost */
861 1, /* scalar_fp_stmt_cost */
862 1, /* scalar_load_cost */
863 1, /* scalar_store_cost */
864 3, /* cond_taken_branch_cost */
865 1, /* cond_not_taken_branch_cost */
866 &qdf24xx_advsimd_vector_cost, /* advsimd */
867 nullptr, /* sve */
868 nullptr /* issue_info */
869 };
870
871
872 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
873 {
874 4, /* int_stmt_cost */
875 1, /* fp_stmt_cost */
876 0, /* ld2_st2_permute_cost */
877 0, /* ld3_st3_permute_cost */
878 0, /* ld4_st4_permute_cost */
879 4, /* permute_cost */
880 2, /* reduc_i8_cost */
881 2, /* reduc_i16_cost */
882 2, /* reduc_i32_cost */
883 2, /* reduc_i64_cost */
884 2, /* reduc_f16_cost */
885 2, /* reduc_f32_cost */
886 2, /* reduc_f64_cost */
887 2, /* store_elt_extra_cost */
888 2, /* vec_to_scalar_cost */
889 2, /* scalar_to_vec_cost */
890 3, /* align_load_cost */
891 5, /* unalign_load_cost */
892 5, /* unalign_store_cost */
893 1 /* store_cost */
894 };
895
896 /* ThunderX costs for vector insn classes. */
897 static const struct cpu_vector_cost thunderx_vector_cost =
898 {
899 1, /* scalar_int_stmt_cost */
900 1, /* scalar_fp_stmt_cost */
901 3, /* scalar_load_cost */
902 1, /* scalar_store_cost */
903 3, /* cond_taken_branch_cost */
904 3, /* cond_not_taken_branch_cost */
905 &thunderx_advsimd_vector_cost, /* advsimd */
906 nullptr, /* sve */
907 nullptr /* issue_info */
908 };
909
910 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
911 {
912 2, /* int_stmt_cost */
913 2, /* fp_stmt_cost */
914 0, /* ld2_st2_permute_cost */
915 0, /* ld3_st3_permute_cost */
916 0, /* ld4_st4_permute_cost */
917 2, /* permute_cost */
918 3, /* reduc_i8_cost */
919 3, /* reduc_i16_cost */
920 3, /* reduc_i32_cost */
921 3, /* reduc_i64_cost */
922 3, /* reduc_f16_cost */
923 3, /* reduc_f32_cost */
924 3, /* reduc_f64_cost */
925 3, /* store_elt_extra_cost */
926 3, /* vec_to_scalar_cost */
927 2, /* scalar_to_vec_cost */
928 5, /* align_load_cost */
929 5, /* unalign_load_cost */
930 1, /* unalign_store_cost */
931 1 /* store_cost */
932 };
933
934 static const struct cpu_vector_cost tsv110_vector_cost =
935 {
936 1, /* scalar_int_stmt_cost */
937 1, /* scalar_fp_stmt_cost */
938 5, /* scalar_load_cost */
939 1, /* scalar_store_cost */
940 1, /* cond_taken_branch_cost */
941 1, /* cond_not_taken_branch_cost */
942 &tsv110_advsimd_vector_cost, /* advsimd */
943 nullptr, /* sve */
944 nullptr /* issue_info */
945 };
946
947 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
948 {
949 2, /* int_stmt_cost */
950 2, /* fp_stmt_cost */
951 0, /* ld2_st2_permute_cost */
952 0, /* ld3_st3_permute_cost */
953 0, /* ld4_st4_permute_cost */
954 3, /* permute_cost */
955 8, /* reduc_i8_cost */
956 8, /* reduc_i16_cost */
957 8, /* reduc_i32_cost */
958 8, /* reduc_i64_cost */
959 8, /* reduc_f16_cost */
960 8, /* reduc_f32_cost */
961 8, /* reduc_f64_cost */
962 8, /* store_elt_extra_cost */
963 8, /* vec_to_scalar_cost */
964 8, /* scalar_to_vec_cost */
965 4, /* align_load_cost */
966 4, /* unalign_load_cost */
967 1, /* unalign_store_cost */
968 1 /* store_cost */
969 };
970
971 /* Cortex-A57 costs for vector insn classes. */
972 static const struct cpu_vector_cost cortexa57_vector_cost =
973 {
974 1, /* scalar_int_stmt_cost */
975 1, /* scalar_fp_stmt_cost */
976 4, /* scalar_load_cost */
977 1, /* scalar_store_cost */
978 1, /* cond_taken_branch_cost */
979 1, /* cond_not_taken_branch_cost */
980 &cortexa57_advsimd_vector_cost, /* advsimd */
981 nullptr, /* sve */
982 nullptr /* issue_info */
983 };
984
985 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
986 {
987 3, /* int_stmt_cost */
988 3, /* fp_stmt_cost */
989 0, /* ld2_st2_permute_cost */
990 0, /* ld3_st3_permute_cost */
991 0, /* ld4_st4_permute_cost */
992 3, /* permute_cost */
993 3, /* reduc_i8_cost */
994 3, /* reduc_i16_cost */
995 3, /* reduc_i32_cost */
996 3, /* reduc_i64_cost */
997 3, /* reduc_f16_cost */
998 3, /* reduc_f32_cost */
999 3, /* reduc_f64_cost */
1000 3, /* store_elt_extra_cost */
1001 3, /* vec_to_scalar_cost */
1002 3, /* scalar_to_vec_cost */
1003 5, /* align_load_cost */
1004 5, /* unalign_load_cost */
1005 1, /* unalign_store_cost */
1006 1 /* store_cost */
1007 };
1008
1009 static const struct cpu_vector_cost exynosm1_vector_cost =
1010 {
1011 1, /* scalar_int_stmt_cost */
1012 1, /* scalar_fp_stmt_cost */
1013 5, /* scalar_load_cost */
1014 1, /* scalar_store_cost */
1015 1, /* cond_taken_branch_cost */
1016 1, /* cond_not_taken_branch_cost */
1017 &exynosm1_advsimd_vector_cost, /* advsimd */
1018 nullptr, /* sve */
1019 nullptr /* issue_info */
1020 };
1021
1022 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1023 {
1024 2, /* int_stmt_cost */
1025 2, /* fp_stmt_cost */
1026 0, /* ld2_st2_permute_cost */
1027 0, /* ld3_st3_permute_cost */
1028 0, /* ld4_st4_permute_cost */
1029 2, /* permute_cost */
1030 4, /* reduc_i8_cost */
1031 4, /* reduc_i16_cost */
1032 4, /* reduc_i32_cost */
1033 4, /* reduc_i64_cost */
1034 4, /* reduc_f16_cost */
1035 4, /* reduc_f32_cost */
1036 4, /* reduc_f64_cost */
1037 4, /* store_elt_extra_cost */
1038 4, /* vec_to_scalar_cost */
1039 4, /* scalar_to_vec_cost */
1040 10, /* align_load_cost */
1041 10, /* unalign_load_cost */
1042 2, /* unalign_store_cost */
1043 2 /* store_cost */
1044 };
1045
1046 /* X-Gene 1 costs for vector insn classes. */
1047 static const struct cpu_vector_cost xgene1_vector_cost =
1048 {
1049 1, /* scalar_int_stmt_cost */
1050 1, /* scalar_fp_stmt_cost */
1051 5, /* scalar_load_cost */
1052 1, /* scalar_store_cost */
1053 2, /* cond_taken_branch_cost */
1054 1, /* cond_not_taken_branch_cost */
1055 &xgene1_advsimd_vector_cost, /* advsimd */
1056 nullptr, /* sve */
1057 nullptr /* issue_info */
1058 };
1059
1060 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1061 {
1062 4, /* int_stmt_cost */
1063 5, /* fp_stmt_cost */
1064 0, /* ld2_st2_permute_cost */
1065 0, /* ld3_st3_permute_cost */
1066 0, /* ld4_st4_permute_cost */
1067 10, /* permute_cost */
1068 6, /* reduc_i8_cost */
1069 6, /* reduc_i16_cost */
1070 6, /* reduc_i32_cost */
1071 6, /* reduc_i64_cost */
1072 6, /* reduc_f16_cost */
1073 6, /* reduc_f32_cost */
1074 6, /* reduc_f64_cost */
1075 6, /* store_elt_extra_cost */
1076 6, /* vec_to_scalar_cost */
1077 5, /* scalar_to_vec_cost */
1078 4, /* align_load_cost */
1079 4, /* unalign_load_cost */
1080 1, /* unalign_store_cost */
1081 1 /* store_cost */
1082 };
1083
1084 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
1085 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1086 {
1087 1, /* scalar_int_stmt_cost */
1088 6, /* scalar_fp_stmt_cost */
1089 4, /* scalar_load_cost */
1090 1, /* scalar_store_cost */
1091 2, /* cond_taken_branch_cost */
1092 1, /* cond_not_taken_branch_cost */
1093 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1094 nullptr, /* sve */
1095 nullptr /* issue_info */
1096 };
1097
1098 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1099 {
1100 5, /* int_stmt_cost */
1101 5, /* fp_stmt_cost */
1102 0, /* ld2_st2_permute_cost */
1103 0, /* ld3_st3_permute_cost */
1104 0, /* ld4_st4_permute_cost */
1105 10, /* permute_cost */
1106 5, /* reduc_i8_cost */
1107 5, /* reduc_i16_cost */
1108 5, /* reduc_i32_cost */
1109 5, /* reduc_i64_cost */
1110 5, /* reduc_f16_cost */
1111 5, /* reduc_f32_cost */
1112 5, /* reduc_f64_cost */
1113 5, /* store_elt_extra_cost */
1114 5, /* vec_to_scalar_cost */
1115 5, /* scalar_to_vec_cost */
1116 4, /* align_load_cost */
1117 4, /* unalign_load_cost */
1118 4, /* unalign_store_cost */
1119 4 /* store_cost */
1120 };
1121
1122 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1123 {
1124 1, /* scalar_int_stmt_cost */
1125 5, /* scalar_fp_stmt_cost */
1126 4, /* scalar_load_cost */
1127 1, /* scalar_store_cost */
1128 2, /* cond_taken_branch_cost */
1129 1, /* cond_not_taken_branch_cost */
1130 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1131 nullptr, /* sve */
1132 nullptr /* issue_info */
1133 };
1134
1135 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1136 {
1137 3, /* int_stmt_cost */
1138 3, /* fp_stmt_cost */
1139 0, /* ld2_st2_permute_cost */
1140 0, /* ld3_st3_permute_cost */
1141 0, /* ld4_st4_permute_cost */
1142 2, /* permute_cost */
1143 12, /* reduc_i8_cost */
1144 9, /* reduc_i16_cost */
1145 6, /* reduc_i32_cost */
1146 5, /* reduc_i64_cost */
1147 9, /* reduc_f16_cost */
1148 6, /* reduc_f32_cost */
1149 5, /* reduc_f64_cost */
1150 8, /* store_elt_extra_cost */
1151 6, /* vec_to_scalar_cost */
1152 7, /* scalar_to_vec_cost */
1153 5, /* align_load_cost */
1154 5, /* unalign_load_cost */
1155 2, /* unalign_store_cost */
1156 2 /* store_cost */
1157 };
1158
1159 /* Ampere-1 costs for vector insn classes. */
1160 static const struct cpu_vector_cost ampere1_vector_cost =
1161 {
1162 1, /* scalar_int_stmt_cost */
1163 1, /* scalar_fp_stmt_cost */
1164 4, /* scalar_load_cost */
1165 1, /* scalar_store_cost */
1166 1, /* cond_taken_branch_cost */
1167 1, /* cond_not_taken_branch_cost */
1168 &ampere1_advsimd_vector_cost, /* advsimd */
1169 nullptr, /* sve */
1170 nullptr /* issue_info */
1171 };
1172
1173 /* Generic costs for branch instructions. */
1174 static const struct cpu_branch_cost generic_branch_cost =
1175 {
1176 1, /* Predictable. */
1177 3 /* Unpredictable. */
1178 };
1179
1180 /* Generic approximation modes. */
1181 static const cpu_approx_modes generic_approx_modes =
1182 {
1183 AARCH64_APPROX_NONE, /* division */
1184 AARCH64_APPROX_NONE, /* sqrt */
1185 AARCH64_APPROX_NONE /* recip_sqrt */
1186 };
1187
1188 /* Approximation modes for Exynos M1. */
1189 static const cpu_approx_modes exynosm1_approx_modes =
1190 {
1191 AARCH64_APPROX_NONE, /* division */
1192 AARCH64_APPROX_ALL, /* sqrt */
1193 AARCH64_APPROX_ALL /* recip_sqrt */
1194 };
1195
1196 /* Approximation modes for X-Gene 1. */
1197 static const cpu_approx_modes xgene1_approx_modes =
1198 {
1199 AARCH64_APPROX_NONE, /* division */
1200 AARCH64_APPROX_NONE, /* sqrt */
1201 AARCH64_APPROX_ALL /* recip_sqrt */
1202 };
1203
1204 /* Generic prefetch settings (which disable prefetch). */
1205 static const cpu_prefetch_tune generic_prefetch_tune =
1206 {
1207 0, /* num_slots */
1208 -1, /* l1_cache_size */
1209 -1, /* l1_cache_line_size */
1210 -1, /* l2_cache_size */
1211 true, /* prefetch_dynamic_strides */
1212 -1, /* minimum_stride */
1213 -1 /* default_opt_level */
1214 };
1215
1216 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1217 {
1218 0, /* num_slots */
1219 -1, /* l1_cache_size */
1220 64, /* l1_cache_line_size */
1221 -1, /* l2_cache_size */
1222 true, /* prefetch_dynamic_strides */
1223 -1, /* minimum_stride */
1224 -1 /* default_opt_level */
1225 };
1226
1227 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1228 {
1229 4, /* num_slots */
1230 32, /* l1_cache_size */
1231 64, /* l1_cache_line_size */
1232 512, /* l2_cache_size */
1233 false, /* prefetch_dynamic_strides */
1234 2048, /* minimum_stride */
1235 3 /* default_opt_level */
1236 };
1237
1238 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1239 {
1240 8, /* num_slots */
1241 32, /* l1_cache_size */
1242 128, /* l1_cache_line_size */
1243 16*1024, /* l2_cache_size */
1244 true, /* prefetch_dynamic_strides */
1245 -1, /* minimum_stride */
1246 3 /* default_opt_level */
1247 };
1248
1249 static const cpu_prefetch_tune thunderx_prefetch_tune =
1250 {
1251 8, /* num_slots */
1252 32, /* l1_cache_size */
1253 128, /* l1_cache_line_size */
1254 -1, /* l2_cache_size */
1255 true, /* prefetch_dynamic_strides */
1256 -1, /* minimum_stride */
1257 -1 /* default_opt_level */
1258 };
1259
1260 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1261 {
1262 8, /* num_slots */
1263 32, /* l1_cache_size */
1264 64, /* l1_cache_line_size */
1265 256, /* l2_cache_size */
1266 true, /* prefetch_dynamic_strides */
1267 -1, /* minimum_stride */
1268 -1 /* default_opt_level */
1269 };
1270
1271 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1272 {
1273 8, /* num_slots */
1274 32, /* l1_cache_size */
1275 64, /* l1_cache_line_size */
1276 256, /* l2_cache_size */
1277 true, /* prefetch_dynamic_strides */
1278 -1, /* minimum_stride */
1279 -1 /* default_opt_level */
1280 };
1281
1282 static const cpu_prefetch_tune tsv110_prefetch_tune =
1283 {
1284 0, /* num_slots */
1285 64, /* l1_cache_size */
1286 64, /* l1_cache_line_size */
1287 512, /* l2_cache_size */
1288 true, /* prefetch_dynamic_strides */
1289 -1, /* minimum_stride */
1290 -1 /* default_opt_level */
1291 };
1292
1293 static const cpu_prefetch_tune xgene1_prefetch_tune =
1294 {
1295 8, /* num_slots */
1296 32, /* l1_cache_size */
1297 64, /* l1_cache_line_size */
1298 256, /* l2_cache_size */
1299 true, /* prefetch_dynamic_strides */
1300 -1, /* minimum_stride */
1301 -1 /* default_opt_level */
1302 };
1303
1304 static const cpu_prefetch_tune a64fx_prefetch_tune =
1305 {
1306 8, /* num_slots */
1307 64, /* l1_cache_size */
1308 256, /* l1_cache_line_size */
1309 32768, /* l2_cache_size */
1310 true, /* prefetch_dynamic_strides */
1311 -1, /* minimum_stride */
1312 -1 /* default_opt_level */
1313 };
1314
1315 static const cpu_prefetch_tune ampere1_prefetch_tune =
1316 {
1317 0, /* num_slots */
1318 64, /* l1_cache_size */
1319 64, /* l1_cache_line_size */
1320 2048, /* l2_cache_size */
1321 true, /* prefetch_dynamic_strides */
1322 -1, /* minimum_stride */
1323 -1 /* default_opt_level */
1324 };
1325
1326 static const struct tune_params generic_tunings =
1327 {
1328 &cortexa57_extra_costs,
1329 &generic_addrcost_table,
1330 &generic_regmove_cost,
1331 &generic_vector_cost,
1332 &generic_branch_cost,
1333 &generic_approx_modes,
1334 SVE_NOT_IMPLEMENTED, /* sve_width */
1335 { 4, /* load_int. */
1336 4, /* store_int. */
1337 4, /* load_fp. */
1338 4, /* store_fp. */
1339 4, /* load_pred. */
1340 4 /* store_pred. */
1341 }, /* memmov_cost. */
1342 2, /* issue_rate */
1343 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1344 "16:12", /* function_align. */
1345 "4", /* jump_align. */
1346 "8", /* loop_align. */
1347 2, /* int_reassoc_width. */
1348 4, /* fp_reassoc_width. */
1349 1, /* fma_reassoc_width. */
1350 1, /* vec_reassoc_width. */
1351 2, /* min_div_recip_mul_sf. */
1352 2, /* min_div_recip_mul_df. */
1353 0, /* max_case_values. */
1354 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1355 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1356 Neoverse V1. It does not have a noticeable effect on A64FX and should
1357 have at most a very minor effect on SVE2 cores. */
1358 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1359 &generic_prefetch_tune
1360 };
1361
1362 static const struct tune_params cortexa35_tunings =
1363 {
1364 &cortexa53_extra_costs,
1365 &generic_addrcost_table,
1366 &cortexa53_regmove_cost,
1367 &generic_vector_cost,
1368 &generic_branch_cost,
1369 &generic_approx_modes,
1370 SVE_NOT_IMPLEMENTED, /* sve_width */
1371 { 4, /* load_int. */
1372 4, /* store_int. */
1373 4, /* load_fp. */
1374 4, /* store_fp. */
1375 4, /* load_pred. */
1376 4 /* store_pred. */
1377 }, /* memmov_cost. */
1378 1, /* issue_rate */
1379 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1381 "16", /* function_align. */
1382 "4", /* jump_align. */
1383 "8", /* loop_align. */
1384 2, /* int_reassoc_width. */
1385 4, /* fp_reassoc_width. */
1386 1, /* fma_reassoc_width. */
1387 1, /* vec_reassoc_width. */
1388 2, /* min_div_recip_mul_sf. */
1389 2, /* min_div_recip_mul_df. */
1390 0, /* max_case_values. */
1391 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1392 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1393 &generic_prefetch_tune
1394 };
1395
1396 static const struct tune_params cortexa53_tunings =
1397 {
1398 &cortexa53_extra_costs,
1399 &generic_addrcost_table,
1400 &cortexa53_regmove_cost,
1401 &generic_vector_cost,
1402 &generic_branch_cost,
1403 &generic_approx_modes,
1404 SVE_NOT_IMPLEMENTED, /* sve_width */
1405 { 4, /* load_int. */
1406 4, /* store_int. */
1407 4, /* load_fp. */
1408 4, /* store_fp. */
1409 4, /* load_pred. */
1410 4 /* store_pred. */
1411 }, /* memmov_cost. */
1412 2, /* issue_rate */
1413 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1414 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1415 "16", /* function_align. */
1416 "4", /* jump_align. */
1417 "8", /* loop_align. */
1418 2, /* int_reassoc_width. */
1419 4, /* fp_reassoc_width. */
1420 1, /* fma_reassoc_width. */
1421 1, /* vec_reassoc_width. */
1422 2, /* min_div_recip_mul_sf. */
1423 2, /* min_div_recip_mul_df. */
1424 0, /* max_case_values. */
1425 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1426 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1427 &generic_prefetch_tune
1428 };
1429
1430 static const struct tune_params cortexa57_tunings =
1431 {
1432 &cortexa57_extra_costs,
1433 &generic_addrcost_table,
1434 &cortexa57_regmove_cost,
1435 &cortexa57_vector_cost,
1436 &generic_branch_cost,
1437 &generic_approx_modes,
1438 SVE_NOT_IMPLEMENTED, /* sve_width */
1439 { 4, /* load_int. */
1440 4, /* store_int. */
1441 4, /* load_fp. */
1442 4, /* store_fp. */
1443 4, /* load_pred. */
1444 4 /* store_pred. */
1445 }, /* memmov_cost. */
1446 3, /* issue_rate */
1447 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1448 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1449 "16", /* function_align. */
1450 "4", /* jump_align. */
1451 "8", /* loop_align. */
1452 2, /* int_reassoc_width. */
1453 4, /* fp_reassoc_width. */
1454 1, /* fma_reassoc_width. */
1455 1, /* vec_reassoc_width. */
1456 2, /* min_div_recip_mul_sf. */
1457 2, /* min_div_recip_mul_df. */
1458 0, /* max_case_values. */
1459 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1460 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1461 &generic_prefetch_tune
1462 };
1463
1464 static const struct tune_params cortexa72_tunings =
1465 {
1466 &cortexa57_extra_costs,
1467 &generic_addrcost_table,
1468 &cortexa57_regmove_cost,
1469 &cortexa57_vector_cost,
1470 &generic_branch_cost,
1471 &generic_approx_modes,
1472 SVE_NOT_IMPLEMENTED, /* sve_width */
1473 { 4, /* load_int. */
1474 4, /* store_int. */
1475 4, /* load_fp. */
1476 4, /* store_fp. */
1477 4, /* load_pred. */
1478 4 /* store_pred. */
1479 }, /* memmov_cost. */
1480 3, /* issue_rate */
1481 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1482 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1483 "16", /* function_align. */
1484 "4", /* jump_align. */
1485 "8", /* loop_align. */
1486 2, /* int_reassoc_width. */
1487 4, /* fp_reassoc_width. */
1488 1, /* fma_reassoc_width. */
1489 1, /* vec_reassoc_width. */
1490 2, /* min_div_recip_mul_sf. */
1491 2, /* min_div_recip_mul_df. */
1492 0, /* max_case_values. */
1493 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1494 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1495 &generic_prefetch_tune
1496 };
1497
1498 static const struct tune_params cortexa73_tunings =
1499 {
1500 &cortexa57_extra_costs,
1501 &generic_addrcost_table,
1502 &cortexa57_regmove_cost,
1503 &cortexa57_vector_cost,
1504 &generic_branch_cost,
1505 &generic_approx_modes,
1506 SVE_NOT_IMPLEMENTED, /* sve_width */
1507 { 4, /* load_int. */
1508 4, /* store_int. */
1509 4, /* load_fp. */
1510 4, /* store_fp. */
1511 4, /* load_pred. */
1512 4 /* store_pred. */
1513 }, /* memmov_cost. */
1514 2, /* issue_rate. */
1515 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1516 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1517 "16", /* function_align. */
1518 "4", /* jump_align. */
1519 "8", /* loop_align. */
1520 2, /* int_reassoc_width. */
1521 4, /* fp_reassoc_width. */
1522 1, /* fma_reassoc_width. */
1523 1, /* vec_reassoc_width. */
1524 2, /* min_div_recip_mul_sf. */
1525 2, /* min_div_recip_mul_df. */
1526 0, /* max_case_values. */
1527 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1528 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1529 &generic_prefetch_tune
1530 };
1531
1532
1533
1534 static const struct tune_params exynosm1_tunings =
1535 {
1536 &exynosm1_extra_costs,
1537 &exynosm1_addrcost_table,
1538 &exynosm1_regmove_cost,
1539 &exynosm1_vector_cost,
1540 &generic_branch_cost,
1541 &exynosm1_approx_modes,
1542 SVE_NOT_IMPLEMENTED, /* sve_width */
1543 { 4, /* load_int. */
1544 4, /* store_int. */
1545 4, /* load_fp. */
1546 4, /* store_fp. */
1547 4, /* load_pred. */
1548 4 /* store_pred. */
1549 }, /* memmov_cost. */
1550 3, /* issue_rate */
1551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1552 "4", /* function_align. */
1553 "4", /* jump_align. */
1554 "4", /* loop_align. */
1555 2, /* int_reassoc_width. */
1556 4, /* fp_reassoc_width. */
1557 1, /* fma_reassoc_width. */
1558 1, /* vec_reassoc_width. */
1559 2, /* min_div_recip_mul_sf. */
1560 2, /* min_div_recip_mul_df. */
1561 48, /* max_case_values. */
1562 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1563 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1564 &exynosm1_prefetch_tune
1565 };
1566
1567 static const struct tune_params thunderxt88_tunings =
1568 {
1569 &thunderx_extra_costs,
1570 &generic_addrcost_table,
1571 &thunderx_regmove_cost,
1572 &thunderx_vector_cost,
1573 &generic_branch_cost,
1574 &generic_approx_modes,
1575 SVE_NOT_IMPLEMENTED, /* sve_width */
1576 { 6, /* load_int. */
1577 6, /* store_int. */
1578 6, /* load_fp. */
1579 6, /* store_fp. */
1580 6, /* load_pred. */
1581 6 /* store_pred. */
1582 }, /* memmov_cost. */
1583 2, /* issue_rate */
1584 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1585 "8", /* function_align. */
1586 "8", /* jump_align. */
1587 "8", /* loop_align. */
1588 2, /* int_reassoc_width. */
1589 4, /* fp_reassoc_width. */
1590 1, /* fma_reassoc_width. */
1591 1, /* vec_reassoc_width. */
1592 2, /* min_div_recip_mul_sf. */
1593 2, /* min_div_recip_mul_df. */
1594 0, /* max_case_values. */
1595 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1596 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1597 &thunderxt88_prefetch_tune
1598 };
1599
1600 static const struct tune_params thunderx_tunings =
1601 {
1602 &thunderx_extra_costs,
1603 &generic_addrcost_table,
1604 &thunderx_regmove_cost,
1605 &thunderx_vector_cost,
1606 &generic_branch_cost,
1607 &generic_approx_modes,
1608 SVE_NOT_IMPLEMENTED, /* sve_width */
1609 { 6, /* load_int. */
1610 6, /* store_int. */
1611 6, /* load_fp. */
1612 6, /* store_fp. */
1613 6, /* load_pred. */
1614 6 /* store_pred. */
1615 }, /* memmov_cost. */
1616 2, /* issue_rate */
1617 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1618 "8", /* function_align. */
1619 "8", /* jump_align. */
1620 "8", /* loop_align. */
1621 2, /* int_reassoc_width. */
1622 4, /* fp_reassoc_width. */
1623 1, /* fma_reassoc_width. */
1624 1, /* vec_reassoc_width. */
1625 2, /* min_div_recip_mul_sf. */
1626 2, /* min_div_recip_mul_df. */
1627 0, /* max_case_values. */
1628 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1629 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1630 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1631 &thunderx_prefetch_tune
1632 };
1633
1634 static const struct tune_params tsv110_tunings =
1635 {
1636 &tsv110_extra_costs,
1637 &tsv110_addrcost_table,
1638 &tsv110_regmove_cost,
1639 &tsv110_vector_cost,
1640 &generic_branch_cost,
1641 &generic_approx_modes,
1642 SVE_NOT_IMPLEMENTED, /* sve_width */
1643 { 4, /* load_int. */
1644 4, /* store_int. */
1645 4, /* load_fp. */
1646 4, /* store_fp. */
1647 4, /* load_pred. */
1648 4 /* store_pred. */
1649 }, /* memmov_cost. */
1650 4, /* issue_rate */
1651 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1652 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1653 "16", /* function_align. */
1654 "4", /* jump_align. */
1655 "8", /* loop_align. */
1656 2, /* int_reassoc_width. */
1657 4, /* fp_reassoc_width. */
1658 1, /* fma_reassoc_width. */
1659 1, /* vec_reassoc_width. */
1660 2, /* min_div_recip_mul_sf. */
1661 2, /* min_div_recip_mul_df. */
1662 0, /* max_case_values. */
1663 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1664 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1665 &tsv110_prefetch_tune
1666 };
1667
1668 static const struct tune_params xgene1_tunings =
1669 {
1670 &xgene1_extra_costs,
1671 &xgene1_addrcost_table,
1672 &xgene1_regmove_cost,
1673 &xgene1_vector_cost,
1674 &generic_branch_cost,
1675 &xgene1_approx_modes,
1676 SVE_NOT_IMPLEMENTED, /* sve_width */
1677 { 6, /* load_int. */
1678 6, /* store_int. */
1679 6, /* load_fp. */
1680 6, /* store_fp. */
1681 6, /* load_pred. */
1682 6 /* store_pred. */
1683 }, /* memmov_cost. */
1684 4, /* issue_rate */
1685 AARCH64_FUSE_NOTHING, /* fusible_ops */
1686 "16", /* function_align. */
1687 "16", /* jump_align. */
1688 "16", /* loop_align. */
1689 2, /* int_reassoc_width. */
1690 4, /* fp_reassoc_width. */
1691 1, /* fma_reassoc_width. */
1692 1, /* vec_reassoc_width. */
1693 2, /* min_div_recip_mul_sf. */
1694 2, /* min_div_recip_mul_df. */
1695 17, /* max_case_values. */
1696 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1697 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1698 &xgene1_prefetch_tune
1699 };
1700
1701 static const struct tune_params emag_tunings =
1702 {
1703 &xgene1_extra_costs,
1704 &xgene1_addrcost_table,
1705 &xgene1_regmove_cost,
1706 &xgene1_vector_cost,
1707 &generic_branch_cost,
1708 &xgene1_approx_modes,
1709 SVE_NOT_IMPLEMENTED,
1710 { 6, /* load_int. */
1711 6, /* store_int. */
1712 6, /* load_fp. */
1713 6, /* store_fp. */
1714 6, /* load_pred. */
1715 6 /* store_pred. */
1716 }, /* memmov_cost. */
1717 4, /* issue_rate */
1718 AARCH64_FUSE_NOTHING, /* fusible_ops */
1719 "16", /* function_align. */
1720 "16", /* jump_align. */
1721 "16", /* loop_align. */
1722 2, /* int_reassoc_width. */
1723 4, /* fp_reassoc_width. */
1724 1, /* fma_reassoc_width. */
1725 1, /* vec_reassoc_width. */
1726 2, /* min_div_recip_mul_sf. */
1727 2, /* min_div_recip_mul_df. */
1728 17, /* max_case_values. */
1729 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1730 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1731 &xgene1_prefetch_tune
1732 };
1733
1734 static const struct tune_params qdf24xx_tunings =
1735 {
1736 &qdf24xx_extra_costs,
1737 &qdf24xx_addrcost_table,
1738 &qdf24xx_regmove_cost,
1739 &qdf24xx_vector_cost,
1740 &generic_branch_cost,
1741 &generic_approx_modes,
1742 SVE_NOT_IMPLEMENTED, /* sve_width */
1743 { 4, /* load_int. */
1744 4, /* store_int. */
1745 4, /* load_fp. */
1746 4, /* store_fp. */
1747 4, /* load_pred. */
1748 4 /* store_pred. */
1749 }, /* memmov_cost. */
1750 4, /* issue_rate */
1751 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1753 "16", /* function_align. */
1754 "8", /* jump_align. */
1755 "16", /* loop_align. */
1756 2, /* int_reassoc_width. */
1757 4, /* fp_reassoc_width. */
1758 1, /* fma_reassoc_width. */
1759 1, /* vec_reassoc_width. */
1760 2, /* min_div_recip_mul_sf. */
1761 2, /* min_div_recip_mul_df. */
1762 0, /* max_case_values. */
1763 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1764 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1765 &qdf24xx_prefetch_tune
1766 };
1767
1768 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1769 for now. */
1770 static const struct tune_params saphira_tunings =
1771 {
1772 &generic_extra_costs,
1773 &generic_addrcost_table,
1774 &generic_regmove_cost,
1775 &generic_vector_cost,
1776 &generic_branch_cost,
1777 &generic_approx_modes,
1778 SVE_NOT_IMPLEMENTED, /* sve_width */
1779 { 4, /* load_int. */
1780 4, /* store_int. */
1781 4, /* load_fp. */
1782 4, /* store_fp. */
1783 4, /* load_pred. */
1784 4 /* store_pred. */
1785 }, /* memmov_cost. */
1786 4, /* issue_rate */
1787 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1788 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1789 "16", /* function_align. */
1790 "8", /* jump_align. */
1791 "16", /* loop_align. */
1792 2, /* int_reassoc_width. */
1793 4, /* fp_reassoc_width. */
1794 1, /* fma_reassoc_width. */
1795 1, /* vec_reassoc_width. */
1796 2, /* min_div_recip_mul_sf. */
1797 2, /* min_div_recip_mul_df. */
1798 0, /* max_case_values. */
1799 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1800 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1801 &generic_prefetch_tune
1802 };
1803
1804 static const struct tune_params thunderx2t99_tunings =
1805 {
1806 &thunderx2t99_extra_costs,
1807 &thunderx2t99_addrcost_table,
1808 &thunderx2t99_regmove_cost,
1809 &thunderx2t99_vector_cost,
1810 &generic_branch_cost,
1811 &generic_approx_modes,
1812 SVE_NOT_IMPLEMENTED, /* sve_width */
1813 { 4, /* load_int. */
1814 4, /* store_int. */
1815 4, /* load_fp. */
1816 4, /* store_fp. */
1817 4, /* load_pred. */
1818 4 /* store_pred. */
1819 }, /* memmov_cost. */
1820 4, /* issue_rate. */
1821 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1822 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1823 "16", /* function_align. */
1824 "8", /* jump_align. */
1825 "16", /* loop_align. */
1826 3, /* int_reassoc_width. */
1827 2, /* fp_reassoc_width. */
1828 1, /* fma_reassoc_width. */
1829 2, /* vec_reassoc_width. */
1830 2, /* min_div_recip_mul_sf. */
1831 2, /* min_div_recip_mul_df. */
1832 0, /* max_case_values. */
1833 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1834 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1835 &thunderx2t99_prefetch_tune
1836 };
1837
1838 static const struct tune_params thunderx3t110_tunings =
1839 {
1840 &thunderx3t110_extra_costs,
1841 &thunderx3t110_addrcost_table,
1842 &thunderx3t110_regmove_cost,
1843 &thunderx3t110_vector_cost,
1844 &generic_branch_cost,
1845 &generic_approx_modes,
1846 SVE_NOT_IMPLEMENTED, /* sve_width */
1847 { 4, /* load_int. */
1848 4, /* store_int. */
1849 4, /* load_fp. */
1850 4, /* store_fp. */
1851 4, /* load_pred. */
1852 4 /* store_pred. */
1853 }, /* memmov_cost. */
1854 6, /* issue_rate. */
1855 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1856 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1857 "16", /* function_align. */
1858 "8", /* jump_align. */
1859 "16", /* loop_align. */
1860 3, /* int_reassoc_width. */
1861 2, /* fp_reassoc_width. */
1862 1, /* fma_reassoc_width. */
1863 2, /* vec_reassoc_width. */
1864 2, /* min_div_recip_mul_sf. */
1865 2, /* min_div_recip_mul_df. */
1866 0, /* max_case_values. */
1867 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1868 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1869 &thunderx3t110_prefetch_tune
1870 };
1871
1872 static const struct tune_params neoversen1_tunings =
1873 {
1874 &cortexa76_extra_costs,
1875 &generic_addrcost_table,
1876 &generic_regmove_cost,
1877 &cortexa57_vector_cost,
1878 &generic_branch_cost,
1879 &generic_approx_modes,
1880 SVE_NOT_IMPLEMENTED, /* sve_width */
1881 { 4, /* load_int. */
1882 2, /* store_int. */
1883 5, /* load_fp. */
1884 2, /* store_fp. */
1885 4, /* load_pred. */
1886 4 /* store_pred. */
1887 }, /* memmov_cost. */
1888 3, /* issue_rate */
1889 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1890 "32:16", /* function_align. */
1891 "4", /* jump_align. */
1892 "32:16", /* loop_align. */
1893 2, /* int_reassoc_width. */
1894 4, /* fp_reassoc_width. */
1895 1, /* fma_reassoc_width. */
1896 2, /* vec_reassoc_width. */
1897 2, /* min_div_recip_mul_sf. */
1898 2, /* min_div_recip_mul_df. */
1899 0, /* max_case_values. */
1900 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1901 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1902 &generic_prefetch_tune
1903 };
1904
1905 static const struct tune_params ampere1_tunings =
1906 {
1907 &ampere1_extra_costs,
1908 &generic_addrcost_table,
1909 &generic_regmove_cost,
1910 &ampere1_vector_cost,
1911 &generic_branch_cost,
1912 &generic_approx_modes,
1913 SVE_NOT_IMPLEMENTED, /* sve_width */
1914 { 4, /* load_int. */
1915 4, /* store_int. */
1916 4, /* load_fp. */
1917 4, /* store_fp. */
1918 4, /* load_pred. */
1919 4 /* store_pred. */
1920 }, /* memmov_cost. */
1921 4, /* issue_rate */
1922 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1923 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1924 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1925 AARCH64_FUSE_CMP_BRANCH),
1926 /* fusible_ops */
1927 "32", /* function_align. */
1928 "4", /* jump_align. */
1929 "32:16", /* loop_align. */
1930 2, /* int_reassoc_width. */
1931 4, /* fp_reassoc_width. */
1932 1, /* fma_reassoc_width. */
1933 2, /* vec_reassoc_width. */
1934 2, /* min_div_recip_mul_sf. */
1935 2, /* min_div_recip_mul_df. */
1936 0, /* max_case_values. */
1937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1939 &ampere1_prefetch_tune
1940 };
1941
1942 static const struct tune_params ampere1a_tunings =
1943 {
1944 &ampere1a_extra_costs,
1945 &generic_addrcost_table,
1946 &generic_regmove_cost,
1947 &ampere1_vector_cost,
1948 &generic_branch_cost,
1949 &generic_approx_modes,
1950 SVE_NOT_IMPLEMENTED, /* sve_width */
1951 { 4, /* load_int. */
1952 4, /* store_int. */
1953 4, /* load_fp. */
1954 4, /* store_fp. */
1955 4, /* load_pred. */
1956 4 /* store_pred. */
1957 }, /* memmov_cost. */
1958 4, /* issue_rate */
1959 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1960 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1961 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1962 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1963 AARCH64_FUSE_ADDSUB_2REG_CONST1),
1964 /* fusible_ops */
1965 "32", /* function_align. */
1966 "4", /* jump_align. */
1967 "32:16", /* loop_align. */
1968 2, /* int_reassoc_width. */
1969 4, /* fp_reassoc_width. */
1970 1, /* fma_reassoc_width. */
1971 2, /* vec_reassoc_width. */
1972 2, /* min_div_recip_mul_sf. */
1973 2, /* min_div_recip_mul_df. */
1974 0, /* max_case_values. */
1975 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1976 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1977 &ampere1_prefetch_tune
1978 };
1979
1980 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1981 {
1982 2, /* int_stmt_cost */
1983 2, /* fp_stmt_cost */
1984 4, /* ld2_st2_permute_cost */
1985 4, /* ld3_st3_permute_cost */
1986 5, /* ld4_st4_permute_cost */
1987 3, /* permute_cost */
1988 4, /* reduc_i8_cost */
1989 4, /* reduc_i16_cost */
1990 2, /* reduc_i32_cost */
1991 2, /* reduc_i64_cost */
1992 6, /* reduc_f16_cost */
1993 3, /* reduc_f32_cost */
1994 2, /* reduc_f64_cost */
1995 2, /* store_elt_extra_cost */
1996 /* This value is just inherited from the Cortex-A57 table. */
1997 8, /* vec_to_scalar_cost */
1998 /* This depends very much on what the scalar value is and
1999 where it comes from. E.g. some constants take two dependent
2000 instructions or a load, while others might be moved from a GPR.
2001 4 seems to be a reasonable compromise in practice. */
2002 4, /* scalar_to_vec_cost */
2003 4, /* align_load_cost */
2004 4, /* unalign_load_cost */
2005 /* Although stores have a latency of 2 and compete for the
2006 vector pipes, in practice it's better not to model that. */
2007 1, /* unalign_store_cost */
2008 1 /* store_cost */
2009 };
2010
2011 static const sve_vec_cost neoversev1_sve_vector_cost =
2012 {
2013 {
2014 2, /* int_stmt_cost */
2015 2, /* fp_stmt_cost */
2016 4, /* ld2_st2_permute_cost */
2017 7, /* ld3_st3_permute_cost */
2018 8, /* ld4_st4_permute_cost */
2019 3, /* permute_cost */
2020 /* Theoretically, a reduction involving 31 scalar ADDs could
2021 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
2022 completes in 14 cycles, so give it a cost of 31 + 5. */
2023 36, /* reduc_i8_cost */
2024 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
2025 22, /* reduc_i16_cost */
2026 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
2027 14, /* reduc_i32_cost */
2028 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
2029 11, /* reduc_i64_cost */
2030 /* Theoretically, a reduction involving 15 scalar FADDs could
2031 complete in ~9 cycles and would have a cost of 30. FADDV
2032 completes in 13 cycles, so give it a cost of 30 + 4. */
2033 34, /* reduc_f16_cost */
2034 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
2035 19, /* reduc_f32_cost */
2036 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
2037 11, /* reduc_f64_cost */
2038 2, /* store_elt_extra_cost */
2039 /* This value is just inherited from the Cortex-A57 table. */
2040 8, /* vec_to_scalar_cost */
2041 /* See the comment above the Advanced SIMD versions. */
2042 4, /* scalar_to_vec_cost */
2043 4, /* align_load_cost */
2044 4, /* unalign_load_cost */
2045 /* Although stores have a latency of 2 and compete for the
2046 vector pipes, in practice it's better not to model that. */
2047 1, /* unalign_store_cost */
2048 1 /* store_cost */
2049 },
2050 3, /* clast_cost */
2051 19, /* fadda_f16_cost */
2052 11, /* fadda_f32_cost */
2053 8, /* fadda_f64_cost */
2054 32, /* gather_load_x32_cost */
2055 16, /* gather_load_x64_cost */
2056 3 /* scatter_store_elt_cost */
2057 };
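/* An informal note on the reduc_*_cost derivations above (a reading aid,
   not extra data): each comment applies the same rule of thumb

     reduction cost = number of scalar ops + (vector latency - scalar latency)

   i.e. start from the cost of the equivalent tree of scalar additions
   (one op per lane, minus one) and add however many extra cycles the
   single [SU]ADDV or FADDV is expected to need.  For reduc_i8_cost on a
   256-bit Neoverse V1 vector that gives 31 + (14 - 9) = 36.  */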
2058
2059 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2060 {
2061 3, /* loads_stores_per_cycle */
2062 2, /* stores_per_cycle */
2063 4, /* general_ops_per_cycle */
2064 0, /* fp_simd_load_general_ops */
2065 1 /* fp_simd_store_general_ops */
2066 };
2067
2068 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2069 {
2070 {
2071 3, /* loads_stores_per_cycle */
2072 2, /* stores_per_cycle */
2073 4, /* general_ops_per_cycle */
2074 0, /* fp_simd_load_general_ops */
2075 1 /* fp_simd_store_general_ops */
2076 },
2077 2, /* ld2_st2_general_ops */
2078 2, /* ld3_st3_general_ops */
2079 3 /* ld4_st4_general_ops */
2080 };
2081
2082 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2083 {
2084 {
2085 {
2086 2, /* loads_per_cycle */
2087 2, /* stores_per_cycle */
2088 2, /* general_ops_per_cycle */
2089 0, /* fp_simd_load_general_ops */
2090 1 /* fp_simd_store_general_ops */
2091 },
2092 2, /* ld2_st2_general_ops */
2093 2, /* ld3_st3_general_ops */
2094 3 /* ld4_st4_general_ops */
2095 },
2096 1, /* pred_ops_per_cycle */
2097 2, /* while_pred_ops */
2098 2, /* int_cmp_pred_ops */
2099 1, /* fp_cmp_pred_ops */
2100 1, /* gather_scatter_pair_general_ops */
2101 1 /* gather_scatter_pair_pred_ops */
2102 };
2103
2104 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2105 {
2106 &neoversev1_scalar_issue_info,
2107 &neoversev1_advsimd_issue_info,
2108 &neoversev1_sve_issue_info
2109 };
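/* Rough sketch of how the issue info above is consumed (an assumption
   about usage, not a specification): when the new vector cost model is
   enabled, these per-cycle limits act as throughput caps for estimating a
   lower bound on cycles per loop iteration.  With the SVE limits above
   (2 loads, 2 stores and 2 general ops per cycle, 1 predicate op per
   cycle, a WHILE counting as 2 predicate ops), an iteration containing
   4 loads, 4 arithmetic ops and 1 WHILE would be bounded by
   max (4/2, 4/2, 2/1) = 2 cycles, and the scalar, Advanced SIMD and SVE
   estimates can then be compared against each other.  */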
2110
2111 /* Neoverse V1 costs for vector insn classes. */
2112 static const struct cpu_vector_cost neoversev1_vector_cost =
2113 {
2114 1, /* scalar_int_stmt_cost */
2115 2, /* scalar_fp_stmt_cost */
2116 4, /* scalar_load_cost */
2117 1, /* scalar_store_cost */
2118 1, /* cond_taken_branch_cost */
2119 1, /* cond_not_taken_branch_cost */
2120 &neoversev1_advsimd_vector_cost, /* advsimd */
2121 &neoversev1_sve_vector_cost, /* sve */
2122 &neoversev1_vec_issue_info /* issue_info */
2123 };
2124
2125 static const struct tune_params neoversev1_tunings =
2126 {
2127 &cortexa76_extra_costs,
2128 &neoversev1_addrcost_table,
2129 &neoversev1_regmove_cost,
2130 &neoversev1_vector_cost,
2131 &generic_branch_cost,
2132 &generic_approx_modes,
2133 SVE_256, /* sve_width */
2134 { 4, /* load_int. */
2135 2, /* store_int. */
2136 6, /* load_fp. */
2137 2, /* store_fp. */
2138 6, /* load_pred. */
2139 1 /* store_pred. */
2140 }, /* memmov_cost. */
2141 3, /* issue_rate */
2142 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2143 "32:16", /* function_align. */
2144 "4", /* jump_align. */
2145 "32:16", /* loop_align. */
2146 2, /* int_reassoc_width. */
2147 4, /* fp_reassoc_width. */
2148 4, /* fma_reassoc_width. */
2149 2, /* vec_reassoc_width. */
2150 2, /* min_div_recip_mul_sf. */
2151 2, /* min_div_recip_mul_df. */
2152 0, /* max_case_values. */
2153 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2154 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2155 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2156 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2157 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2158 &generic_prefetch_tune
2159 };
2160
2161 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2162 {
2163 {
2164 2, /* int_stmt_cost */
2165 2, /* fp_stmt_cost */
2166 4, /* ld2_st2_permute_cost */
2167 5, /* ld3_st3_permute_cost */
2168 5, /* ld4_st4_permute_cost */
2169 3, /* permute_cost */
2170 /* Theoretically, a reduction involving 15 scalar ADDs could
2171 complete in ~5 cycles and would have a cost of 15. Assume that
2172 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2173 21, /* reduc_i8_cost */
2174 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2175 13, /* reduc_i16_cost */
2176 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2177 9, /* reduc_i32_cost */
2178 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2179 8, /* reduc_i64_cost */
2180 /* Theoretically, a reduction involving 7 scalar FADDs could
2181 complete in ~6 cycles and would have a cost of 14. Assume that
2182 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2183 16, /* reduc_f16_cost */
2184 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2185 8, /* reduc_f32_cost */
2186 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2187 4, /* reduc_f64_cost */
2188 2, /* store_elt_extra_cost */
2189 /* This value is just inherited from the Cortex-A57 table. */
2190 8, /* vec_to_scalar_cost */
2191 /* This depends very much on what the scalar value is and
2192 where it comes from. E.g. some constants take two dependent
2193 instructions or a load, while others might be moved from a GPR.
2194 4 seems to be a reasonable compromise in practice. */
2195 4, /* scalar_to_vec_cost */
2196 4, /* align_load_cost */
2197 4, /* unalign_load_cost */
2198 /* Although stores generally have a latency of 2 and compete for the
2199 vector pipes, in practice it's better not to model that. */
2200 1, /* unalign_store_cost */
2201 1 /* store_cost */
2202 },
2203 3, /* clast_cost */
2204 10, /* fadda_f16_cost */
2205 6, /* fadda_f32_cost */
2206 4, /* fadda_f64_cost */
2207 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2208 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2209 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2210 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2211 (cost 2) to that, to avoid the difference being lost in rounding.
2212
2213 There is no easy comparison between a strided Advanced SIMD x32 load
2214 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2215 operation more than a 64-bit gather. */
2216 14, /* gather_load_x32_cost */
2217 12, /* gather_load_x64_cost */
2218 3 /* scatter_store_elt_cost */
2219 };
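/* Spelling out the gather costs above (derived from the comment, not new
   data): the Advanced SIMD x64 strided equivalent is costed as 2 scalar
   loads (2 * 4 = 8) plus a vec_construct (2), and one further full vector
   operation (2) is added so that the SVE gather stays visibly more
   expensive, giving gather_load_x64_cost = 8 + 2 + 2 = 12; the x32 case is
   costed as one vector operation more, giving 14.  */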
2220
2221 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2222 {
2223 {
2224 {
2225 3, /* loads_per_cycle */
2226 2, /* stores_per_cycle */
2227 4, /* general_ops_per_cycle */
2228 0, /* fp_simd_load_general_ops */
2229 1 /* fp_simd_store_general_ops */
2230 },
2231 2, /* ld2_st2_general_ops */
2232 2, /* ld3_st3_general_ops */
2233 3 /* ld4_st4_general_ops */
2234 },
2235 2, /* pred_ops_per_cycle */
2236 2, /* while_pred_ops */
2237 2, /* int_cmp_pred_ops */
2238 1, /* fp_cmp_pred_ops */
2239 1, /* gather_scatter_pair_general_ops */
2240 1 /* gather_scatter_pair_pred_ops */
2241 };
2242
2243 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2244 {
2245 &neoversev1_scalar_issue_info,
2246 &neoversev1_advsimd_issue_info,
2247 &neoverse512tvb_sve_issue_info
2248 };
2249
2250 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2251 {
2252 1, /* scalar_int_stmt_cost */
2253 2, /* scalar_fp_stmt_cost */
2254 4, /* scalar_load_cost */
2255 1, /* scalar_store_cost */
2256 1, /* cond_taken_branch_cost */
2257 1, /* cond_not_taken_branch_cost */
2258 &neoversev1_advsimd_vector_cost, /* advsimd */
2259 &neoverse512tvb_sve_vector_cost, /* sve */
2260 &neoverse512tvb_vec_issue_info /* issue_info */
2261 };
2262
2263 static const struct tune_params neoverse512tvb_tunings =
2264 {
2265 &cortexa76_extra_costs,
2266 &neoversev1_addrcost_table,
2267 &neoversev1_regmove_cost,
2268 &neoverse512tvb_vector_cost,
2269 &generic_branch_cost,
2270 &generic_approx_modes,
2271 SVE_128 | SVE_256, /* sve_width */
2272 { 4, /* load_int. */
2273 2, /* store_int. */
2274 6, /* load_fp. */
2275 2, /* store_fp. */
2276 6, /* load_pred. */
2277 1 /* store_pred. */
2278 }, /* memmov_cost. */
2279 3, /* issue_rate */
2280 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2281 "32:16", /* function_align. */
2282 "4", /* jump_align. */
2283 "32:16", /* loop_align. */
2284 2, /* int_reassoc_width. */
2285 4, /* fp_reassoc_width. */
2286 4, /* fma_reassoc_width. */
2287 2, /* vec_reassoc_width. */
2288 2, /* min_div_recip_mul_sf. */
2289 2, /* min_div_recip_mul_df. */
2290 0, /* max_case_values. */
2291 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2292 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2293 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2294 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2295 &generic_prefetch_tune
2296 };
2297
2298 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2299 {
2300 2, /* int_stmt_cost */
2301 2, /* fp_stmt_cost */
2302 2, /* ld2_st2_permute_cost */
2303 2, /* ld3_st3_permute_cost */
2304 3, /* ld4_st4_permute_cost */
2305 3, /* permute_cost */
2306 4, /* reduc_i8_cost */
2307 4, /* reduc_i16_cost */
2308 2, /* reduc_i32_cost */
2309 2, /* reduc_i64_cost */
2310 6, /* reduc_f16_cost */
2311 4, /* reduc_f32_cost */
2312 2, /* reduc_f64_cost */
2313 2, /* store_elt_extra_cost */
2314 /* This value is just inherited from the Cortex-A57 table. */
2315 8, /* vec_to_scalar_cost */
2316 /* This depends very much on what the scalar value is and
2317 where it comes from. E.g. some constants take two dependent
2318 instructions or a load, while others might be moved from a GPR.
2319 4 seems to be a reasonable compromise in practice. */
2320 4, /* scalar_to_vec_cost */
2321 4, /* align_load_cost */
2322 4, /* unalign_load_cost */
2323 /* Although stores have a latency of 2 and compete for the
2324 vector pipes, in practice it's better not to model that. */
2325 1, /* unalign_store_cost */
2326 1 /* store_cost */
2327 };
2328
2329 static const sve_vec_cost neoversen2_sve_vector_cost =
2330 {
2331 {
2332 2, /* int_stmt_cost */
2333 2, /* fp_stmt_cost */
2334 3, /* ld2_st2_permute_cost */
2335 4, /* ld3_st3_permute_cost */
2336 4, /* ld4_st4_permute_cost */
2337 3, /* permute_cost */
2338 /* Theoretically, a reduction involving 15 scalar ADDs could
2339 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2340 completes in 11 cycles, so give it a cost of 15 + 6. */
2341 21, /* reduc_i8_cost */
2342 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2343 13, /* reduc_i16_cost */
2344 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2345 9, /* reduc_i32_cost */
2346 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2347 2, /* reduc_i64_cost */
2348 /* Theoretically, a reduction involving 7 scalar FADDs could
2349 complete in ~8 cycles and would have a cost of 14. FADDV
2350 completes in 6 cycles, so give it a cost of 14 - 2. */
2351 12, /* reduc_f16_cost */
2352 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2353 6, /* reduc_f32_cost */
2354 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2355 2, /* reduc_f64_cost */
2356 2, /* store_elt_extra_cost */
2357 /* This value is just inherited from the Cortex-A57 table. */
2358 8, /* vec_to_scalar_cost */
2359 /* See the comment above the Advanced SIMD versions. */
2360 4, /* scalar_to_vec_cost */
2361 4, /* align_load_cost */
2362 4, /* unalign_load_cost */
2363 /* Although stores have a latency of 2 and compete for the
2364 vector pipes, in practice it's better not to model that. */
2365 1, /* unalign_store_cost */
2366 1 /* store_cost */
2367 },
2368 3, /* clast_cost */
2369 10, /* fadda_f16_cost */
2370 6, /* fadda_f32_cost */
2371 4, /* fadda_f64_cost */
2372 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2373 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2374 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2375 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2376 (cost 2) to that, to avoid the difference being lost in rounding.
2377
2378 There is no easy comparison between a strided Advanced SIMD x32 load
2379 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2380 operation more than a 64-bit gather. */
2381 14, /* gather_load_x32_cost */
2382 12, /* gather_load_x64_cost */
2383 3 /* scatter_store_elt_cost */
2384 };
2385
2386 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2387 {
2388 3, /* loads_stores_per_cycle */
2389 2, /* stores_per_cycle */
2390 4, /* general_ops_per_cycle */
2391 0, /* fp_simd_load_general_ops */
2392 1 /* fp_simd_store_general_ops */
2393 };
2394
2395 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2396 {
2397 {
2398 3, /* loads_stores_per_cycle */
2399 2, /* stores_per_cycle */
2400 2, /* general_ops_per_cycle */
2401 0, /* fp_simd_load_general_ops */
2402 1 /* fp_simd_store_general_ops */
2403 },
2404 2, /* ld2_st2_general_ops */
2405 2, /* ld3_st3_general_ops */
2406 3 /* ld4_st4_general_ops */
2407 };
2408
2409 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2410 {
2411 {
2412 {
2413 3, /* loads_per_cycle */
2414 2, /* stores_per_cycle */
2415 2, /* general_ops_per_cycle */
2416 0, /* fp_simd_load_general_ops */
2417 1 /* fp_simd_store_general_ops */
2418 },
2419 2, /* ld2_st2_general_ops */
2420 3, /* ld3_st3_general_ops */
2421 3 /* ld4_st4_general_ops */
2422 },
2423 2, /* pred_ops_per_cycle */
2424 2, /* while_pred_ops */
2425 2, /* int_cmp_pred_ops */
2426 1, /* fp_cmp_pred_ops */
2427 1, /* gather_scatter_pair_general_ops */
2428 1 /* gather_scatter_pair_pred_ops */
2429 };
2430
2431 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2432 {
2433 &neoversen2_scalar_issue_info,
2434 &neoversen2_advsimd_issue_info,
2435 &neoversen2_sve_issue_info
2436 };
2437
2438 /* Neoverse N2 costs for vector insn classes. */
2439 static const struct cpu_vector_cost neoversen2_vector_cost =
2440 {
2441 1, /* scalar_int_stmt_cost */
2442 2, /* scalar_fp_stmt_cost */
2443 4, /* scalar_load_cost */
2444 1, /* scalar_store_cost */
2445 1, /* cond_taken_branch_cost */
2446 1, /* cond_not_taken_branch_cost */
2447 &neoversen2_advsimd_vector_cost, /* advsimd */
2448 &neoversen2_sve_vector_cost, /* sve */
2449 &neoversen2_vec_issue_info /* issue_info */
2450 };
2451
2452 static const struct tune_params neoversen2_tunings =
2453 {
2454 &cortexa76_extra_costs,
2455 &neoversen2_addrcost_table,
2456 &neoversen2_regmove_cost,
2457 &neoversen2_vector_cost,
2458 &generic_branch_cost,
2459 &generic_approx_modes,
2460 SVE_128, /* sve_width */
2461 { 4, /* load_int. */
2462 1, /* store_int. */
2463 6, /* load_fp. */
2464 2, /* store_fp. */
2465 6, /* load_pred. */
2466 1 /* store_pred. */
2467 }, /* memmov_cost. */
2468 3, /* issue_rate */
2469 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2470 "32:16", /* function_align. */
2471 "4", /* jump_align. */
2472 "32:16", /* loop_align. */
2473 2, /* int_reassoc_width. */
2474 4, /* fp_reassoc_width. */
2475 1, /* fma_reassoc_width. */
2476 2, /* vec_reassoc_width. */
2477 2, /* min_div_recip_mul_sf. */
2478 2, /* min_div_recip_mul_df. */
2479 0, /* max_case_values. */
2480 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2481 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2482 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2483 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2484 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2485 &generic_prefetch_tune
2486 };
2487
2488 static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
2489 {
2490 2, /* int_stmt_cost */
2491 2, /* fp_stmt_cost */
2492 2, /* ld2_st2_permute_cost */
2493 2, /* ld3_st3_permute_cost */
2494 3, /* ld4_st4_permute_cost */
2495 3, /* permute_cost */
2496 4, /* reduc_i8_cost */
2497 4, /* reduc_i16_cost */
2498 2, /* reduc_i32_cost */
2499 2, /* reduc_i64_cost */
2500 6, /* reduc_f16_cost */
2501 3, /* reduc_f32_cost */
2502 2, /* reduc_f64_cost */
2503 2, /* store_elt_extra_cost */
2504 /* This value is just inherited from the Cortex-A57 table. */
2505 8, /* vec_to_scalar_cost */
2506 /* This depends very much on what the scalar value is and
2507 where it comes from. E.g. some constants take two dependent
2508 instructions or a load, while others might be moved from a GPR.
2509 4 seems to be a reasonable compromise in practice. */
2510 4, /* scalar_to_vec_cost */
2511 4, /* align_load_cost */
2512 4, /* unalign_load_cost */
2513 /* Although stores have a latency of 2 and compete for the
2514 vector pipes, in practice it's better not to model that. */
2515 1, /* unalign_store_cost */
2516 1 /* store_cost */
2517 };
2518
2519 static const sve_vec_cost neoversev2_sve_vector_cost =
2520 {
2521 {
2522 2, /* int_stmt_cost */
2523 2, /* fp_stmt_cost */
2524 3, /* ld2_st2_permute_cost */
2525 3, /* ld3_st3_permute_cost */
2526 4, /* ld4_st4_permute_cost */
2527 3, /* permute_cost */
2528 /* Theoretically, a reduction involving 15 scalar ADDs could
2529 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2530 completes in 11 cycles, so give it a cost of 15 + 8. */
2531 21, /* reduc_i8_cost */
2532 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2533 14, /* reduc_i16_cost */
2534 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2535 7, /* reduc_i32_cost */
2536 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2537 2, /* reduc_i64_cost */
2538 /* Theoretically, a reduction involving 7 scalar FADDs could
2539 complete in ~6 cycles and would have a cost of 14. FADDV
2540 completes in 8 cycles, so give it a cost of 14 + 2. */
2541 16, /* reduc_f16_cost */
2542 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2543 8, /* reduc_f32_cost */
2544 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2545 4, /* reduc_f64_cost */
2546 2, /* store_elt_extra_cost */
2547 /* This value is just inherited from the Cortex-A57 table. */
2548 8, /* vec_to_scalar_cost */
2549 /* See the comment above the Advanced SIMD versions. */
2550 4, /* scalar_to_vec_cost */
2551 4, /* align_load_cost */
2552 4, /* unalign_load_cost */
2553 /* Although stores have a latency of 2 and compete for the
2554 vector pipes, in practice it's better not to model that. */
2555 1, /* unalign_store_cost */
2556 1 /* store_cost */
2557 },
2558 3, /* clast_cost */
2559 10, /* fadda_f16_cost */
2560 6, /* fadda_f32_cost */
2561 4, /* fadda_f64_cost */
2562 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2563 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2564 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2565 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2566 (cost 2) to that, to avoid the difference being lost in rounding.
2567
2568 There is no easy comparison between a strided Advanced SIMD x32 load
2569 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2570 operation more than a 64-bit gather. */
2571 14, /* gather_load_x32_cost */
2572 12, /* gather_load_x64_cost */
2573 3 /* scatter_store_elt_cost */
2574 };
2575
2576 static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
2577 {
2578 3, /* loads_stores_per_cycle */
2579 2, /* stores_per_cycle */
2580 6, /* general_ops_per_cycle */
2581 0, /* fp_simd_load_general_ops */
2582 1 /* fp_simd_store_general_ops */
2583 };
2584
2585 static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
2586 {
2587 {
2588 3, /* loads_stores_per_cycle */
2589 2, /* stores_per_cycle */
2590 4, /* general_ops_per_cycle */
2591 0, /* fp_simd_load_general_ops */
2592 1 /* fp_simd_store_general_ops */
2593 },
2594 2, /* ld2_st2_general_ops */
2595 2, /* ld3_st3_general_ops */
2596 3 /* ld4_st4_general_ops */
2597 };
2598
2599 static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
2600 {
2601 {
2602 {
2603 3, /* loads_per_cycle */
2604 2, /* stores_per_cycle */
2605 4, /* general_ops_per_cycle */
2606 0, /* fp_simd_load_general_ops */
2607 1 /* fp_simd_store_general_ops */
2608 },
2609 2, /* ld2_st2_general_ops */
2610 3, /* ld3_st3_general_ops */
2611 3 /* ld4_st4_general_ops */
2612 },
2613 2, /* pred_ops_per_cycle */
2614 2, /* while_pred_ops */
2615 2, /* int_cmp_pred_ops */
2616 1, /* fp_cmp_pred_ops */
2617 1, /* gather_scatter_pair_general_ops */
2618 1 /* gather_scatter_pair_pred_ops */
2619 };
2620
2621 static const aarch64_vec_issue_info neoversev2_vec_issue_info =
2622 {
2623 &neoversev2_scalar_issue_info,
2624 &neoversev2_advsimd_issue_info,
2625 &neoversev2_sve_issue_info
2626 };
2627
2628 /* Neoverse V2 costs for vector insn classes. */
2629 static const struct cpu_vector_cost neoversev2_vector_cost =
2630 {
2631 1, /* scalar_int_stmt_cost */
2632 2, /* scalar_fp_stmt_cost */
2633 4, /* scalar_load_cost */
2634 1, /* scalar_store_cost */
2635 1, /* cond_taken_branch_cost */
2636 1, /* cond_not_taken_branch_cost */
2637 &neoversev2_advsimd_vector_cost, /* advsimd */
2638 &neoversev2_sve_vector_cost, /* sve */
2639 &neoversev2_vec_issue_info /* issue_info */
2640 };
2641
2642 static const struct tune_params neoversev2_tunings =
2643 {
2644 &cortexa76_extra_costs,
2645 &neoversev2_addrcost_table,
2646 &neoversev2_regmove_cost,
2647 &neoversev2_vector_cost,
2648 &generic_branch_cost,
2649 &generic_approx_modes,
2650 SVE_128, /* sve_width */
2651 { 4, /* load_int. */
2652 2, /* store_int. */
2653 6, /* load_fp. */
2654 1, /* store_fp. */
2655 6, /* load_pred. */
2656 2 /* store_pred. */
2657 }, /* memmov_cost. */
2658 5, /* issue_rate */
2659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2660 "32:16", /* function_align. */
2661 "4", /* jump_align. */
2662 "32:16", /* loop_align. */
2663 3, /* int_reassoc_width. */
2664 6, /* fp_reassoc_width. */
2665 4, /* fma_reassoc_width. */
2666 3, /* vec_reassoc_width. */
2667 2, /* min_div_recip_mul_sf. */
2668 2, /* min_div_recip_mul_df. */
2669 0, /* max_case_values. */
2670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2671 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2672 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2673 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2674 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2675 &generic_prefetch_tune
2676 };
2677
2678 static const struct tune_params a64fx_tunings =
2679 {
2680 &a64fx_extra_costs,
2681 &a64fx_addrcost_table,
2682 &a64fx_regmove_cost,
2683 &a64fx_vector_cost,
2684 &generic_branch_cost,
2685 &generic_approx_modes,
2686 SVE_512, /* sve_width */
2687 { 4, /* load_int. */
2688 4, /* store_int. */
2689 4, /* load_fp. */
2690 4, /* store_fp. */
2691 4, /* load_pred. */
2692 4 /* store_pred. */
2693 }, /* memmov_cost. */
2694 7, /* issue_rate */
2695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2696 "32", /* function_align. */
2697 "16", /* jump_align. */
2698 "32", /* loop_align. */
2699 4, /* int_reassoc_width. */
2700 2, /* fp_reassoc_width. */
2701 1, /* fma_reassoc_width. */
2702 2, /* vec_reassoc_width. */
2703 2, /* min_div_recip_mul_sf. */
2704 2, /* min_div_recip_mul_df. */
2705 0, /* max_case_values. */
2706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2707 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2708 &a64fx_prefetch_tune
2709 };
2710
2711 /* Support for fine-grained override of the tuning structures. */
2712 struct aarch64_tuning_override_function
2713 {
2714 const char* name;
2715 void (*parse_override)(const char*, struct tune_params*);
2716 };
2717
2718 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2719 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2720 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2721
2722 static const struct aarch64_tuning_override_function
2723 aarch64_tuning_override_functions[] =
2724 {
2725 { "fuse", aarch64_parse_fuse_string },
2726 { "tune", aarch64_parse_tune_string },
2727 { "sve_width", aarch64_parse_sve_width_string },
2728 { NULL, NULL }
2729 };
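/* Usage sketch (illustrative; the exact accepted syntax is defined by the
   -moverride parsing code later in this file): these entries back the
   -moverride= option, so a command line such as

     -moverride=sve_width=256

   is routed to aarch64_parse_sve_width_string, and tune= / fuse= payloads
   are routed to aarch64_parse_tune_string and aarch64_parse_fuse_string
   respectively.  */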
2730
2731 /* A processor implementing AArch64. */
2732 struct processor
2733 {
2734 const char *name;
2735 aarch64_processor ident;
2736 aarch64_processor sched_core;
2737 aarch64_arch arch;
2738 aarch64_feature_flags flags;
2739 const tune_params *tune;
2740 };
2741
2742 /* Architectures implementing AArch64. */
2743 static CONSTEXPR const processor all_architectures[] =
2744 {
2745 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
2746 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
2747 feature_deps::ARCH_IDENT ().enable, NULL},
2748 #include "aarch64-arches.def"
2749 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2750 };
2751
2752 /* Processor cores implementing AArch64. */
2753 static const struct processor all_cores[] =
2754 {
2755 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
2756 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2757 feature_deps::cpu_##IDENT, &COSTS##_tunings},
2758 #include "aarch64-cores.def"
2759 {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
2760 feature_deps::V8A ().enable, &generic_tunings},
2761 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2762 };
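/* For illustration only (a hypothetical entry, not taken from
   aarch64-cores.def): a definition along the lines of

     AARCH64_CORE ("somecore", somecore, cortexa57, V8A, (F16), generic,
                   0x00, 0x000, -1)

   would expand in all_cores to

     {"somecore", somecore, cortexa57, AARCH64_ARCH_V8A,
      feature_deps::cpu_somecore, &generic_tunings},

   i.e. NAME, IDENT and SCHED are used directly, ARCH selects the
   AARCH64_ARCH_* value and COSTS selects one of the *_tunings structures
   defined above.  */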
2763
2764 /* The current tuning set. */
2765 struct tune_params aarch64_tune_params = generic_tunings;
2766
2767 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2768
2769 static tree
2770 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2771 int, bool *no_add_attrs)
2772 {
2773 /* Since we set fn_type_req to true, the caller should have checked
2774 this for us. */
2775 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2776 switch ((arm_pcs) fntype_abi (*node).id ())
2777 {
2778 case ARM_PCS_AAPCS64:
2779 case ARM_PCS_SIMD:
2780 return NULL_TREE;
2781
2782 case ARM_PCS_SVE:
2783 error ("the %qE attribute cannot be applied to an SVE function type",
2784 name);
2785 *no_add_attrs = true;
2786 return NULL_TREE;
2787
2788 case ARM_PCS_TLSDESC:
2789 case ARM_PCS_UNKNOWN:
2790 break;
2791 }
2792 gcc_unreachable ();
2793 }
2794
2795 /* Table of machine attributes. */
2796 static const struct attribute_spec aarch64_attribute_table[] =
2797 {
2798 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2799 affects_type_identity, handler, exclude } */
2800 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2801 handle_aarch64_vector_pcs_attribute, NULL },
2802 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2803 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2804 NULL },
2805 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2806 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2807 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2808 { NULL, 0, 0, false, false, false, false, NULL, NULL }
2809 };
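/* Usage sketch for the user-facing attributes above (illustrative only;
   the SVE example additionally assumes arm_sve.h and a fixed
   -msve-vector-bits setting):

     void f (float *x) __attribute__ ((aarch64_vector_pcs));
     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));

   The first selects the Advanced SIMD vector PCS (ARM_PCS_SIMD) for f,
   the second creates a fixed-length SVE vector type via
   aarch64_sve::handle_arm_sve_vector_bits_attribute.  The "Advanced SIMD
   type", "SVE type" and "SVE sizeless type" entries are internal markers
   attached to the built-in vector types rather than attributes users are
   expected to write.  */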
2810
2811 /* An ISA extension in the co-processor and main instruction set space. */
2812 struct aarch64_option_extension
2813 {
2814 const char *const name;
2815 const unsigned long flags_on;
2816 const unsigned long flags_off;
2817 };
2818
2819 typedef enum aarch64_cond_code
2820 {
2821 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2822 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2823 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2824 }
2825 aarch64_cc;
2826
2827 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
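/* For example (this relies only on the enum order above, which matches the
   architectural encoding): AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is
   AARCH64_NE and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is
   AARCH64_LT, since AArch64 condition codes are encoded so that each
   even/odd pair are inverses of each other.  */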
2828
2829 struct aarch64_branch_protect_type
2830 {
2831 /* The type's name that the user passes to the branch-protection option
2832 string. */
2833 const char* name;
2834 /* Function to handle the protection type and set global variables.
2835 First argument is the string token corresponding with this type and the
2836 second argument is the next token in the option string.
2837 Return values:
2838 * AARCH64_PARSE_OK: Handling was successful.
2839 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and
2840 the caller should print an error.
2841 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
2842 prints its own error. */
2843 enum aarch64_parse_opt_result (*handler)(char*, char*);
2844 /* A list of types that can follow this type in the option string. */
2845 const aarch64_branch_protect_type* subtypes;
2846 unsigned int num_subtypes;
2847 };
2848
2849 static enum aarch64_parse_opt_result
2850 aarch64_handle_no_branch_protection (char* str, char* rest)
2851 {
2852 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2853 aarch64_enable_bti = 0;
2854 if (rest)
2855 {
2856 error ("unexpected %<%s%> after %<%s%>", rest, str);
2857 return AARCH64_PARSE_INVALID_FEATURE;
2858 }
2859 return AARCH64_PARSE_OK;
2860 }
2861
2862 static enum aarch64_parse_opt_result
2863 aarch64_handle_standard_branch_protection (char* str, char* rest)
2864 {
2865 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2866 aarch64_ra_sign_key = AARCH64_KEY_A;
2867 aarch64_enable_bti = 1;
2868 if (rest)
2869 {
2870 error ("unexpected %<%s%> after %<%s%>", rest, str);
2871 return AARCH64_PARSE_INVALID_FEATURE;
2872 }
2873 return AARCH64_PARSE_OK;
2874 }
2875
2876 static enum aarch64_parse_opt_result
2877 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2878 char* rest ATTRIBUTE_UNUSED)
2879 {
2880 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2881 aarch64_ra_sign_key = AARCH64_KEY_A;
2882 return AARCH64_PARSE_OK;
2883 }
2884
2885 static enum aarch64_parse_opt_result
2886 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2887 char* rest ATTRIBUTE_UNUSED)
2888 {
2889 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2890 return AARCH64_PARSE_OK;
2891 }
2892
2893 static enum aarch64_parse_opt_result
2894 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2895 char* rest ATTRIBUTE_UNUSED)
2896 {
2897 aarch64_ra_sign_key = AARCH64_KEY_B;
2898 return AARCH64_PARSE_OK;
2899 }
2900
2901 static enum aarch64_parse_opt_result
2902 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2903 char* rest ATTRIBUTE_UNUSED)
2904 {
2905 aarch64_enable_bti = 1;
2906 return AARCH64_PARSE_OK;
2907 }
2908
2909 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2910 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2911 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2912 { NULL, NULL, NULL, 0 }
2913 };
2914
2915 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2916 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2917 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2918 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2919 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2920 { "bti", aarch64_handle_bti_protection, NULL, 0 },
2921 { NULL, NULL, NULL, 0 }
2922 };
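/* Usage sketch (illustrative option strings rather than a full grammar):
   the tables above drive parsing of -mbranch-protection=, e.g.

     -mbranch-protection=none
     -mbranch-protection=standard            (BTI plus pac-ret with the A key)
     -mbranch-protection=pac-ret+leaf        (also sign leaf functions)
     -mbranch-protection=bti+pac-ret+b-key   (BTI, pac-ret using the B key)

   "leaf" and "b-key" are only accepted as subtypes following "pac-ret",
   as described by aarch64_pac_ret_subtypes.  */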
2923
2924 /* The condition codes of the processor, and the inverse function. */
2925 static const char * const aarch64_condition_codes[] =
2926 {
2927 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2928 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2929 };
2930
2931 /* The preferred condition codes for SVE conditions. */
2932 static const char *const aarch64_sve_condition_codes[] =
2933 {
2934 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2935 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2936 };
2937
2938 /* Return the assembly token for svpattern value VALUE. */
2939
2940 static const char *
2941 svpattern_token (enum aarch64_svpattern pattern)
2942 {
2943 switch (pattern)
2944 {
2945 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2946 AARCH64_FOR_SVPATTERN (CASE)
2947 #undef CASE
2948 case AARCH64_NUM_SVPATTERNS:
2949 break;
2950 }
2951 gcc_unreachable ();
2952 }
2953
2954 /* Return the location of a piece that is known to be passed or returned
2955 in registers. FIRST_ZR is the first unused vector argument register
2956 and FIRST_PR is the first unused predicate argument register. */
2957
2958 rtx
2959 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2960 unsigned int first_pr) const
2961 {
2962 gcc_assert (VECTOR_MODE_P (mode)
2963 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2964 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2965
2966 if (num_zr > 0 && num_pr == 0)
2967 return gen_rtx_REG (mode, first_zr);
2968
2969 if (num_zr == 0 && num_pr == 1)
2970 return gen_rtx_REG (mode, first_pr);
2971
2972 gcc_unreachable ();
2973 }
2974
2975 /* Return the total number of vector registers required by the PST. */
2976
2977 unsigned int
2978 pure_scalable_type_info::num_zr () const
2979 {
2980 unsigned int res = 0;
2981 for (unsigned int i = 0; i < pieces.length (); ++i)
2982 res += pieces[i].num_zr;
2983 return res;
2984 }
2985
2986 /* Return the total number of predicate registers required by the PST. */
2987
2988 unsigned int
2989 pure_scalable_type_info::num_pr () const
2990 {
2991 unsigned int res = 0;
2992 for (unsigned int i = 0; i < pieces.length (); ++i)
2993 res += pieces[i].num_pr;
2994 return res;
2995 }
2996
2997 /* Return the location of a PST that is known to be passed or returned
2998 in registers. FIRST_ZR is the first unused vector argument register
2999 and FIRST_PR is the first unused predicate argument register. */
3000
3001 rtx
3002 pure_scalable_type_info::get_rtx (machine_mode mode,
3003 unsigned int first_zr,
3004 unsigned int first_pr) const
3005 {
3006 /* Try to return a single REG if possible. This leads to better
3007 code generation; it isn't required for correctness. */
3008 if (mode == pieces[0].mode)
3009 {
3010 gcc_assert (pieces.length () == 1);
3011 return pieces[0].get_rtx (first_zr, first_pr);
3012 }
3013
3014 /* Build up a PARALLEL that contains the individual pieces. */
3015 rtvec rtxes = rtvec_alloc (pieces.length ());
3016 for (unsigned int i = 0; i < pieces.length (); ++i)
3017 {
3018 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
3019 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
3020 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
3021 first_zr += pieces[i].num_zr;
3022 first_pr += pieces[i].num_pr;
3023 }
3024 return gen_rtx_PARALLEL (mode, rtxes);
3025 }
3026
3027 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
3028 in the AAPCS64. */
3029
3030 pure_scalable_type_info::analysis_result
3031 pure_scalable_type_info::analyze (const_tree type)
3032 {
3033 /* Prevent accidental reuse. */
3034 gcc_assert (pieces.is_empty ());
3035
3036 /* No code will be generated for erroneous types, so we won't establish
3037 an ABI mapping. */
3038 if (type == error_mark_node)
3039 return NO_ABI_IDENTITY;
3040
3041 /* Zero-sized types disappear in the language->ABI mapping. */
3042 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3043 return NO_ABI_IDENTITY;
3044
3045 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
3046 piece p = {};
3047 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
3048 {
3049 machine_mode mode = TYPE_MODE_RAW (type);
3050 gcc_assert (VECTOR_MODE_P (mode)
3051 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
3052
3053 p.mode = p.orig_mode = mode;
3054 add_piece (p);
3055 return IS_PST;
3056 }
3057
3058 /* Check for user-defined PSTs. */
3059 if (TREE_CODE (type) == ARRAY_TYPE)
3060 return analyze_array (type);
3061 if (TREE_CODE (type) == RECORD_TYPE)
3062 return analyze_record (type);
3063
3064 return ISNT_PST;
3065 }
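/* A worked example of the analysis above (informal; the rules themselves
   come from the AAPCS64): given

     struct pst { svfloat32_t vec; svbool_t pred; };

   both fields are SVE built-in types, so the struct is a Pure Scalable
   Type with two pieces (one Z register and one P register) and is passed
   and returned in Z/P registers when enough argument registers are
   available.  Adding, say, an int field would stop the type from being a
   PST and the normal aggregate rules would apply instead.  */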
3066
3067 /* Analyze a type that is known not to be passed or returned in memory.
3068 Return true if it has an ABI identity and is a Pure Scalable Type. */
3069
3070 bool
3071 pure_scalable_type_info::analyze_registers (const_tree type)
3072 {
3073 analysis_result result = analyze (type);
3074 gcc_assert (result != DOESNT_MATTER);
3075 return result == IS_PST;
3076 }
3077
3078 /* Subroutine of analyze for handling ARRAY_TYPEs. */
3079
3080 pure_scalable_type_info::analysis_result
3081 pure_scalable_type_info::analyze_array (const_tree type)
3082 {
3083 /* Analyze the element type. */
3084 pure_scalable_type_info element_info;
3085 analysis_result result = element_info.analyze (TREE_TYPE (type));
3086 if (result != IS_PST)
3087 return result;
3088
3089 /* An array of unknown, flexible or variable length will be passed and
3090 returned by reference whatever we do. */
3091 tree nelts_minus_one = array_type_nelts (type);
3092 if (!tree_fits_uhwi_p (nelts_minus_one))
3093 return DOESNT_MATTER;
3094
3095 /* Likewise if the array is constant-sized but too big to be interesting.
3096 The double checks against MAX_PIECES are to protect against overflow. */
3097 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3098 if (count > MAX_PIECES)
3099 return DOESNT_MATTER;
3100 count += 1;
3101 if (count * element_info.pieces.length () > MAX_PIECES)
3102 return DOESNT_MATTER;
3103
3104 /* The above checks should have weeded out elements of unknown size. */
3105 poly_uint64 element_bytes;
3106 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3107 gcc_unreachable ();
3108
3109 /* Build up the list of individual vectors and predicates. */
3110 gcc_assert (!element_info.pieces.is_empty ());
3111 for (unsigned int i = 0; i < count; ++i)
3112 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3113 {
3114 piece p = element_info.pieces[j];
3115 p.offset += i * element_bytes;
3116 add_piece (p);
3117 }
3118 return IS_PST;
3119 }
3120
3121 /* Subroutine of analyze for handling RECORD_TYPEs. */
3122
3123 pure_scalable_type_info::analysis_result
3124 pure_scalable_type_info::analyze_record (const_tree type)
3125 {
3126 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3127 {
3128 if (TREE_CODE (field) != FIELD_DECL)
3129 continue;
3130
3131 /* Zero-sized fields disappear in the language->ABI mapping. */
3132 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3133 continue;
3134
3135 /* All fields with an ABI identity must be PSTs for the record as
3136 a whole to be a PST. If any individual field is too big to be
3137 interesting then the record is too. */
3138 pure_scalable_type_info field_info;
3139 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3140 if (subresult == NO_ABI_IDENTITY)
3141 continue;
3142 if (subresult != IS_PST)
3143 return subresult;
3144
3145 /* Since all previous fields are PSTs, we ought to be able to track
3146 the field offset using poly_ints. */
3147 tree bitpos = bit_position (field);
3148 gcc_assert (poly_int_tree_p (bitpos));
3149
3150 /* For the same reason, it shouldn't be possible to create a PST field
3151 whose offset isn't byte-aligned. */
3152 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3153 BITS_PER_UNIT);
3154
3155 /* Punt if the record is too big to be interesting. */
3156 poly_uint64 bytepos;
3157 if (!wide_bytepos.to_uhwi (&bytepos)
3158 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3159 return DOESNT_MATTER;
3160
3161 /* Add the individual vectors and predicates in the field to the
3162 record's list. */
3163 gcc_assert (!field_info.pieces.is_empty ());
3164 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3165 {
3166 piece p = field_info.pieces[i];
3167 p.offset += bytepos;
3168 add_piece (p);
3169 }
3170 }
3171 /* Empty structures disappear in the language->ABI mapping. */
3172 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3173 }
3174
3175 /* Add P to the list of pieces in the type. */
3176
3177 void
3178 pure_scalable_type_info::add_piece (const piece &p)
3179 {
3180 /* Try to fold the new piece into the previous one to form a
3181 single-mode PST. For example, if we see three consecutive vectors
3182 of the same mode, we can represent them using the corresponding
3183 3-tuple mode.
3184
3185 This is purely an optimization. */
3186 if (!pieces.is_empty ())
3187 {
3188 piece &prev = pieces.last ();
3189 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3190 unsigned int nelems1, nelems2;
3191 if (prev.orig_mode == p.orig_mode
3192 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3193 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3194 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3195 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3196 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3197 && targetm.array_mode (p.orig_mode,
3198 nelems1 + nelems2).exists (&prev.mode))
3199 {
3200 prev.num_zr += p.num_zr;
3201 prev.num_pr += p.num_pr;
3202 return;
3203 }
3204 }
3205 pieces.quick_push (p);
3206 }
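/* Example of the folding above (a sketch; the exact tuple mode comes from
   targetm.array_mode): two adjacent svfloat32_t fields give two
   consecutive VNx4SFmode pieces whose offsets differ by one vector, and
   they are merged into a single piece using the corresponding 2-vector
   tuple mode (VNx8SFmode), with num_zr accumulated to 2.  A third
   adjacent vector extends the same piece to the 3-tuple mode rather than
   adding a new entry.  */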
3207
3208 /* Return true if at least one possible value of type TYPE includes at
3209 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3210
3211 This is a relatively expensive test for some types, so it should
3212 generally be made as late as possible. */
3213
3214 static bool
3215 aarch64_some_values_include_pst_objects_p (const_tree type)
3216 {
3217 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3218 return false;
3219
3220 if (aarch64_sve::builtin_type_p (type))
3221 return true;
3222
3223 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3224 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3225
3226 if (RECORD_OR_UNION_TYPE_P (type))
3227 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3228 if (TREE_CODE (field) == FIELD_DECL
3229 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3230 return true;
3231
3232 return false;
3233 }
3234
3235 /* Return the descriptor of the SIMD ABI. */
3236
3237 static const predefined_function_abi &
3238 aarch64_simd_abi (void)
3239 {
3240 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3241 if (!simd_abi.initialized_p ())
3242 {
3243 HARD_REG_SET full_reg_clobbers
3244 = default_function_abi.full_reg_clobbers ();
3245 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3246 if (FP_SIMD_SAVED_REGNUM_P (regno))
3247 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3248 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3249 }
3250 return simd_abi;
3251 }
3252
3253 /* Return the descriptor of the SVE PCS. */
3254
3255 static const predefined_function_abi &
3256 aarch64_sve_abi (void)
3257 {
3258 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3259 if (!sve_abi.initialized_p ())
3260 {
3261 HARD_REG_SET full_reg_clobbers
3262 = default_function_abi.full_reg_clobbers ();
3263 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3264 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3265 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3266 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3267 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3268 }
3269 return sve_abi;
3270 }
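/* In other words (a summary of the clobber sets above, not a separate
   definition): relative to the base AAPCS64, the Advanced SIMD vector PCS
   additionally preserves the full q8-q23 registers across calls, and the
   SVE PCS preserves z8-z23 together with the predicate registers p4-p15.  */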
3271
3272 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3273 wraps, otherwise return X itself. */
3274
3275 static rtx
3276 strip_salt (rtx x)
3277 {
3278 rtx search = x;
3279 if (GET_CODE (search) == CONST)
3280 search = XEXP (search, 0);
3281 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3282 x = XVECEXP (search, 0, 0);
3283 return x;
3284 }
3285
3286 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3287 expression. */
3288
3289 static rtx
3290 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3291 {
3292 return strip_salt (strip_offset (addr, offset));
3293 }
3294
3295 /* Generate code to enable conditional branches in functions over 1 MiB. */
3296 const char *
3297 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3298 const char * branch_format)
3299 {
3300 rtx_code_label * tmp_label = gen_label_rtx ();
3301 char label_buf[256];
3302 char buffer[128];
3303 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3304 CODE_LABEL_NUMBER (tmp_label));
3305 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3306 rtx dest_label = operands[pos_label];
3307 operands[pos_label] = tmp_label;
3308
3309 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3310 output_asm_insn (buffer, operands);
3311
3312 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3313 operands[pos_label] = dest_label;
3314 output_asm_insn (buffer, operands);
3315 return "";
3316 }
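/* Sketch of the sequence this emits (illustrative; the exact mnemonic and
   label come from BRANCH_FORMAT and DEST, and callers pass BRANCH_FORMAT
   with the condition already inverted):

       <inverted conditional branch>  .Lxx   ; skip the unconditional jump
       b       <original far destination>
     .Lxx:

   The conditional branch only needs to span two instructions, while the
   unconditional B can reach the out-of-range target.  */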
3317
3318 void
3319 aarch64_err_no_fpadvsimd (machine_mode mode)
3320 {
3321 if (TARGET_GENERAL_REGS_ONLY)
3322 if (FLOAT_MODE_P (mode))
3323 error ("%qs is incompatible with the use of floating-point types",
3324 "-mgeneral-regs-only");
3325 else
3326 error ("%qs is incompatible with the use of vector types",
3327 "-mgeneral-regs-only");
3328 else
3329 if (FLOAT_MODE_P (mode))
3330 error ("%qs feature modifier is incompatible with the use of"
3331 " floating-point types", "+nofp");
3332 else
3333 error ("%qs feature modifier is incompatible with the use of"
3334 " vector types", "+nofp");
3335 }
3336
3337 /* Report when we try to do something that requires SVE when SVE is disabled.
3338 This is an error of last resort and isn't very high-quality. It usually
3339 involves attempts to measure the vector length in some way. */
3340 static void
3341 aarch64_report_sve_required (void)
3342 {
3343 static bool reported_p = false;
3344
3345 /* Avoid reporting a slew of messages for a single oversight. */
3346 if (reported_p)
3347 return;
3348
3349 error ("this operation requires the SVE ISA extension");
3350 inform (input_location, "you can enable SVE using the command-line"
3351 " option %<-march%>, or by using the %<target%>"
3352 " attribute or pragma");
3353 reported_p = true;
3354 }
3355
3356 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3357 registers. */
3358 inline bool
3359 pr_or_ffr_regnum_p (unsigned int regno)
3360 {
3361 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3362 }
3363
3364 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3365 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3366 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3367 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3368 and GENERAL_REGS is lower than the memory cost (in this case the best class
3369 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
3370 cost results in bad allocations with many redundant int<->FP moves which
3371 are expensive on various cores.
3372 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3373 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3374 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3375 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3376 The result of this is that it is no longer inefficient to have a higher
3377 memory move cost than the register move cost.
3378 */
3379
3380 static reg_class_t
3381 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3382 reg_class_t best_class)
3383 {
3384 machine_mode mode;
3385
3386 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3387 || !reg_class_subset_p (FP_REGS, allocno_class))
3388 return allocno_class;
3389
3390 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3391 || !reg_class_subset_p (FP_REGS, best_class))
3392 return best_class;
3393
3394 mode = PSEUDO_REGNO_MODE (regno);
3395 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3396 }
3397
3398 static unsigned int
3399 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3400 {
3401 if (GET_MODE_UNIT_SIZE (mode) == 4)
3402 return aarch64_tune_params.min_div_recip_mul_sf;
3403 return aarch64_tune_params.min_div_recip_mul_df;
3404 }
3405
3406 /* Return the reassociation width of treeop OPC with mode MODE. */
3407 static int
3408 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3409 {
3410 if (VECTOR_MODE_P (mode))
3411 return aarch64_tune_params.vec_reassoc_width;
3412 if (INTEGRAL_MODE_P (mode))
3413 return aarch64_tune_params.int_reassoc_width;
3414 /* Reassociation reduces the number of FMAs which may result in worse
3415 performance. Use a per-CPU setting for FMA reassociation which allows
3416 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
3417 CPUs with many FP pipes to enable reassociation.
3418 Since the reassociation pass doesn't understand FMA at all, assume
3419 that any FP addition might turn into FMA. */
3420 if (FLOAT_MODE_P (mode))
3421 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
3422 : aarch64_tune_params.fp_reassoc_width;
3423 return 1;
3424 }
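/* For example (using the Neoverse N2 values above as an illustration):
   with fp_reassoc_width 4 and fma_reassoc_width 1, chains of FP
   multiplications can be split into up to four independent chains, but
   chains of FP additions are left alone (width 1), since any of those
   additions might later be contracted into an FMA.  */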
3425
3426 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3427 unsigned
3428 aarch64_debugger_regno (unsigned regno)
3429 {
3430 if (GP_REGNUM_P (regno))
3431 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3432 else if (regno == SP_REGNUM)
3433 return AARCH64_DWARF_SP;
3434 else if (FP_REGNUM_P (regno))
3435 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3436 else if (PR_REGNUM_P (regno))
3437 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3438 else if (regno == VG_REGNUM)
3439 return AARCH64_DWARF_VG;
3440
3441 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3442 equivalent DWARF register. */
3443 return DWARF_FRAME_REGISTERS;
3444 }
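/* For reference (the numbers follow from the AARCH64_DWARF_* constants
   used above; treat this as a convenience summary): x0-x30 map to DWARF
   registers 0-30, sp to 31, v0-v31 to 64-95, p0-p15 to 48-63 and vg to
   46.  Everything else reports DWARF_FRAME_REGISTERS, i.e. "no DWARF
   equivalent".  */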
3445
3446 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
3447 static machine_mode
3448 aarch64_dwarf_frame_reg_mode (int regno)
3449 {
3450 /* Predicate registers are call-clobbered in the EH ABI (which is
3451 ARM_PCS_AAPCS64), so they should not be described by CFI.
3452 Their size changes as VL changes, so any values computed by
3453 __builtin_init_dwarf_reg_size_table might not be valid for
3454 all frames. */
3455 if (PR_REGNUM_P (regno))
3456 return VOIDmode;
3457 return default_dwarf_frame_reg_mode (regno);
3458 }
3459
3460 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3461 integer, otherwise return X unmodified. */
3462 static rtx
3463 aarch64_bit_representation (rtx x)
3464 {
3465 if (CONST_DOUBLE_P (x))
3466 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3467 return x;
3468 }
3469
3470 /* Return an estimate for the number of quadwords in an SVE vector. This is
3471 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3472 static unsigned int
3473 aarch64_estimated_sve_vq ()
3474 {
3475 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3476 }
3477
3478 /* Return true if MODE is an SVE predicate mode. */
3479 static bool
3480 aarch64_sve_pred_mode_p (machine_mode mode)
3481 {
3482 return (TARGET_SVE
3483 && (mode == VNx16BImode
3484 || mode == VNx8BImode
3485 || mode == VNx4BImode
3486 || mode == VNx2BImode));
3487 }
3488
3489 /* Three mutually-exclusive flags describing a vector or predicate type. */
3490 const unsigned int VEC_ADVSIMD = 1;
3491 const unsigned int VEC_SVE_DATA = 2;
3492 const unsigned int VEC_SVE_PRED = 4;
3493 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3494 a structure of 2, 3 or 4 vectors. */
3495 const unsigned int VEC_STRUCT = 8;
3496 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3497 vector has fewer significant bytes than a full SVE vector. */
3498 const unsigned int VEC_PARTIAL = 16;
3499 /* Useful combinations of the above. */
3500 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3501 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3502
3503 /* Return a set of flags describing the vector properties of mode MODE.
3504 Ignore modes that are not supported by the current target. */
3505 static unsigned int
3506 aarch64_classify_vector_mode (machine_mode mode)
3507 {
3508 if (aarch64_sve_pred_mode_p (mode))
3509 return VEC_SVE_PRED;
3510
3511 /* Make the decision based on the mode's enum value rather than its
3512 properties, so that we keep the correct classification regardless
3513 of -msve-vector-bits. */
3514 switch (mode)
3515 {
3516 /* Partial SVE QI vectors. */
3517 case E_VNx2QImode:
3518 case E_VNx4QImode:
3519 case E_VNx8QImode:
3520 /* Partial SVE HI vectors. */
3521 case E_VNx2HImode:
3522 case E_VNx4HImode:
3523 /* Partial SVE SI vector. */
3524 case E_VNx2SImode:
3525 /* Partial SVE HF vectors. */
3526 case E_VNx2HFmode:
3527 case E_VNx4HFmode:
3528 /* Partial SVE BF vectors. */
3529 case E_VNx2BFmode:
3530 case E_VNx4BFmode:
3531 /* Partial SVE SF vector. */
3532 case E_VNx2SFmode:
3533 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3534
3535 case E_VNx16QImode:
3536 case E_VNx8HImode:
3537 case E_VNx4SImode:
3538 case E_VNx2DImode:
3539 case E_VNx8BFmode:
3540 case E_VNx8HFmode:
3541 case E_VNx4SFmode:
3542 case E_VNx2DFmode:
3543 return TARGET_SVE ? VEC_SVE_DATA : 0;
3544
3545 /* x2 SVE vectors. */
3546 case E_VNx32QImode:
3547 case E_VNx16HImode:
3548 case E_VNx8SImode:
3549 case E_VNx4DImode:
3550 case E_VNx16BFmode:
3551 case E_VNx16HFmode:
3552 case E_VNx8SFmode:
3553 case E_VNx4DFmode:
3554 /* x3 SVE vectors. */
3555 case E_VNx48QImode:
3556 case E_VNx24HImode:
3557 case E_VNx12SImode:
3558 case E_VNx6DImode:
3559 case E_VNx24BFmode:
3560 case E_VNx24HFmode:
3561 case E_VNx12SFmode:
3562 case E_VNx6DFmode:
3563 /* x4 SVE vectors. */
3564 case E_VNx64QImode:
3565 case E_VNx32HImode:
3566 case E_VNx16SImode:
3567 case E_VNx8DImode:
3568 case E_VNx32BFmode:
3569 case E_VNx32HFmode:
3570 case E_VNx16SFmode:
3571 case E_VNx8DFmode:
3572 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3573
3574 case E_OImode:
3575 case E_CImode:
3576 case E_XImode:
3577 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3578
3579 /* Structures of 64-bit Advanced SIMD vectors. */
3580 case E_V2x8QImode:
3581 case E_V2x4HImode:
3582 case E_V2x2SImode:
3583 case E_V2x1DImode:
3584 case E_V2x4BFmode:
3585 case E_V2x4HFmode:
3586 case E_V2x2SFmode:
3587 case E_V2x1DFmode:
3588 case E_V3x8QImode:
3589 case E_V3x4HImode:
3590 case E_V3x2SImode:
3591 case E_V3x1DImode:
3592 case E_V3x4BFmode:
3593 case E_V3x4HFmode:
3594 case E_V3x2SFmode:
3595 case E_V3x1DFmode:
3596 case E_V4x8QImode:
3597 case E_V4x4HImode:
3598 case E_V4x2SImode:
3599 case E_V4x1DImode:
3600 case E_V4x4BFmode:
3601 case E_V4x4HFmode:
3602 case E_V4x2SFmode:
3603 case E_V4x1DFmode:
3604 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3605
3606 /* Structures of 128-bit Advanced SIMD vectors. */
3607 case E_V2x16QImode:
3608 case E_V2x8HImode:
3609 case E_V2x4SImode:
3610 case E_V2x2DImode:
3611 case E_V2x8BFmode:
3612 case E_V2x8HFmode:
3613 case E_V2x4SFmode:
3614 case E_V2x2DFmode:
3615 case E_V3x16QImode:
3616 case E_V3x8HImode:
3617 case E_V3x4SImode:
3618 case E_V3x2DImode:
3619 case E_V3x8BFmode:
3620 case E_V3x8HFmode:
3621 case E_V3x4SFmode:
3622 case E_V3x2DFmode:
3623 case E_V4x16QImode:
3624 case E_V4x8HImode:
3625 case E_V4x4SImode:
3626 case E_V4x2DImode:
3627 case E_V4x8BFmode:
3628 case E_V4x8HFmode:
3629 case E_V4x4SFmode:
3630 case E_V4x2DFmode:
3631 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3632
3633 /* 64-bit Advanced SIMD vectors. */
3634 case E_V8QImode:
3635 case E_V4HImode:
3636 case E_V2SImode:
3637 case E_V1DImode:
3638 case E_V4HFmode:
3639 case E_V4BFmode:
3640 case E_V2SFmode:
3641 case E_V1DFmode:
3642 /* 128-bit Advanced SIMD vectors. */
3643 case E_V16QImode:
3644 case E_V8HImode:
3645 case E_V4SImode:
3646 case E_V2DImode:
3647 case E_V8HFmode:
3648 case E_V8BFmode:
3649 case E_V4SFmode:
3650 case E_V2DFmode:
3651 return TARGET_FLOAT ? VEC_ADVSIMD : 0;
3652
3653 default:
3654 return 0;
3655 }
3656 }
3657
3658 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3659 bool
3660 aarch64_advsimd_struct_mode_p (machine_mode mode)
3661 {
3662 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3663 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3664 }
3665
3666 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3667 static bool
3668 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3669 {
3670 return (aarch64_classify_vector_mode (mode)
3671 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3672 }
3673
3674 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3675 static bool
3676 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3677 {
3678 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3679 }
3680
3681 /* Return true if MODE is any of the data vector modes, including
3682 structure modes. */
3683 static bool
3684 aarch64_vector_data_mode_p (machine_mode mode)
3685 {
3686 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3687 }
3688
3689 /* Return true if MODE is any form of SVE mode, including predicates,
3690 vectors and structures. */
3691 bool
3692 aarch64_sve_mode_p (machine_mode mode)
3693 {
3694 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3695 }
3696
3697 /* Return true if MODE is an SVE data vector mode; either a single vector
3698 or a structure of vectors. */
3699 static bool
3700 aarch64_sve_data_mode_p (machine_mode mode)
3701 {
3702 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3703 }
3704
3705 /* Return the number of defined bytes in one constituent vector of
3706 SVE mode MODE, which has vector flags VEC_FLAGS. */
3707 static poly_int64
3708 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3709 {
3710 if (vec_flags & VEC_PARTIAL)
3711 /* A single partial vector. */
3712 return GET_MODE_SIZE (mode);
3713
3714 if (vec_flags & VEC_SVE_DATA)
3715 /* A single vector or a tuple. */
3716 return BYTES_PER_SVE_VECTOR;
3717
3718 /* A single predicate. */
3719 gcc_assert (vec_flags & VEC_SVE_PRED);
3720 return BYTES_PER_SVE_PRED;
3721 }
3722
3723 /* If MODE holds an array of vectors, return the number of vectors
3724 in the array, otherwise return 1. */
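/* For example, V3x16QImode (an Advanced SIMD Q-register triple) has
   size 48 and yields 3, VNx32QImode (an SVE pair) yields 2, and a
   plain vector mode such as V16QImode yields 1.  */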
3725
3726 static unsigned int
3727 aarch64_ldn_stn_vectors (machine_mode mode)
3728 {
3729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3730 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3731 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3732 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3733 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3734 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3735 return exact_div (GET_MODE_SIZE (mode),
3736 BYTES_PER_SVE_VECTOR).to_constant ();
3737 return 1;
3738 }
3739
3740 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3741 corresponding vector structure mode. */
3742 static opt_machine_mode
3743 aarch64_advsimd_vector_array_mode (machine_mode mode,
3744 unsigned HOST_WIDE_INT nelems)
3745 {
3746 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3747 if (known_eq (GET_MODE_SIZE (mode), 8))
3748 flags |= VEC_PARTIAL;
3749
3750 machine_mode struct_mode;
3751 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3752 if (aarch64_classify_vector_mode (struct_mode) == flags
3753 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3754 && known_eq (GET_MODE_NUNITS (struct_mode),
3755 GET_MODE_NUNITS (mode) * nelems))
3756 return struct_mode;
3757 return opt_machine_mode ();
3758 }
3759
3760 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3761
3762 opt_machine_mode
3763 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3764 {
3765 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3766 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3767 machine_mode mode;
3768 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3769 if (inner_mode == GET_MODE_INNER (mode)
3770 && known_eq (nunits, GET_MODE_NUNITS (mode))
3771 && aarch64_sve_data_mode_p (mode))
3772 return mode;
3773 return opt_machine_mode ();
3774 }
3775
3776 /* Implement target hook TARGET_ARRAY_MODE. */
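/* For example, an array of 3 V4SImode vectors maps to V3x4SImode and
   an array of 2 VNx4SImode vectors maps to VNx8SImode; other element
   counts return no mode.  */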
3777 static opt_machine_mode
3778 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3779 {
3780 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3781 && IN_RANGE (nelems, 2, 4))
3782 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3783 GET_MODE_NUNITS (mode) * nelems);
3784 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3785 && IN_RANGE (nelems, 2, 4))
3786 return aarch64_advsimd_vector_array_mode (mode, nelems);
3787
3788 return opt_machine_mode ();
3789 }
3790
3791 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3792 static bool
3793 aarch64_array_mode_supported_p (machine_mode mode,
3794 unsigned HOST_WIDE_INT nelems)
3795 {
3796 if (TARGET_SIMD
3797 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3798 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3799 && (nelems >= 2 && nelems <= 4))
3800 return true;
3801
3802 return false;
3803 }
3804
3805 /* MODE is some form of SVE vector mode. For data modes, return the number
3806 of vector register bits that each element of MODE occupies, such as 64
3807 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3808 in a 64-bit container). For predicate modes, return the number of
3809 data bits controlled by each significant predicate bit. */
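/* For example, each significant bit of a VNx4BImode predicate controls
   32 data bits, while each bit of VNx16BImode controls 8.  */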
3810
3811 static unsigned int
3812 aarch64_sve_container_bits (machine_mode mode)
3813 {
3814 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3815 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3816 ? BITS_PER_SVE_VECTOR
3817 : GET_MODE_BITSIZE (mode));
3818 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3819 }
3820
3821 /* Return the SVE predicate mode to use for elements that have
3822 ELEM_NBYTES bytes, if such a mode exists. */
3823
3824 opt_machine_mode
3825 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3826 {
3827 if (TARGET_SVE)
3828 {
3829 if (elem_nbytes == 1)
3830 return VNx16BImode;
3831 if (elem_nbytes == 2)
3832 return VNx8BImode;
3833 if (elem_nbytes == 4)
3834 return VNx4BImode;
3835 if (elem_nbytes == 8)
3836 return VNx2BImode;
3837 }
3838 return opt_machine_mode ();
3839 }
3840
3841 /* Return the SVE predicate mode that should be used to control
3842 SVE mode MODE. */
3843
3844 machine_mode
3845 aarch64_sve_pred_mode (machine_mode mode)
3846 {
3847 unsigned int bits = aarch64_sve_container_bits (mode);
3848 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3849 }
3850
3851 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3852
3853 static opt_machine_mode
3854 aarch64_get_mask_mode (machine_mode mode)
3855 {
3856 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3857 if (vec_flags & VEC_SVE_DATA)
3858 return aarch64_sve_pred_mode (mode);
3859
3860 return default_get_mask_mode (mode);
3861 }
3862
3863 /* Return the integer element mode associated with SVE mode MODE. */
3864
3865 static scalar_int_mode
3866 aarch64_sve_element_int_mode (machine_mode mode)
3867 {
3868 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3869 ? BITS_PER_SVE_VECTOR
3870 : GET_MODE_BITSIZE (mode));
3871 unsigned int elt_bits = vector_element_size (vector_bits,
3872 GET_MODE_NUNITS (mode));
3873 return int_mode_for_size (elt_bits, 0).require ();
3874 }
3875
3876 /* Return an integer element mode that contains exactly
3877 aarch64_sve_container_bits (MODE) bits. This is wider than
3878 aarch64_sve_element_int_mode if MODE is a partial vector,
3879 otherwise it's the same. */
3880
3881 static scalar_int_mode
3882 aarch64_sve_container_int_mode (machine_mode mode)
3883 {
3884 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3885 }
3886
3887 /* Return the integer vector mode associated with SVE mode MODE.
3888 Unlike related_int_vector_mode, this can handle the case in which
3889 MODE is a predicate (and thus has a different total size). */
3890
3891 machine_mode
3892 aarch64_sve_int_mode (machine_mode mode)
3893 {
3894 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3895 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3896 }
3897
3898 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3899
3900 static opt_machine_mode
3901 aarch64_vectorize_related_mode (machine_mode vector_mode,
3902 scalar_mode element_mode,
3903 poly_uint64 nunits)
3904 {
3905 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3906
3907 /* If we're operating on SVE vectors, try to return an SVE mode. */
3908 poly_uint64 sve_nunits;
3909 if ((vec_flags & VEC_SVE_DATA)
3910 && multiple_p (BYTES_PER_SVE_VECTOR,
3911 GET_MODE_SIZE (element_mode), &sve_nunits))
3912 {
3913 machine_mode sve_mode;
3914 if (maybe_ne (nunits, 0U))
3915 {
3916 /* Try to find a full or partial SVE mode with exactly
3917 NUNITS units. */
3918 if (multiple_p (sve_nunits, nunits)
3919 && aarch64_sve_data_mode (element_mode,
3920 nunits).exists (&sve_mode))
3921 return sve_mode;
3922 }
3923 else
3924 {
3925 /* Take the preferred number of units from the number of bytes
3926 that fit in VECTOR_MODE. We always start by "autodetecting"
3927 a full vector mode with preferred_simd_mode, so vectors
3928 chosen here will also be full vector modes. Then
3929 autovectorize_vector_modes tries smaller starting modes
3930 and thus smaller preferred numbers of units. */
3931 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3932 if (aarch64_sve_data_mode (element_mode,
3933 sve_nunits).exists (&sve_mode))
3934 return sve_mode;
3935 }
3936 }
3937
3938 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3939 if (TARGET_SIMD
3940 && (vec_flags & VEC_ADVSIMD)
3941 && known_eq (nunits, 0U)
3942 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3943 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3944 * GET_MODE_NUNITS (vector_mode), 128U))
3945 {
3946 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3947 if (VECTOR_MODE_P (res))
3948 return res;
3949 }
3950
3951 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3952 }
3953
3954 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3955 prefer to use the first arithmetic operand as the else value if
3956 the else value doesn't matter, since that exactly matches the SVE
3957 destructive merging form. For ternary operations we could either
3958 pick the first operand and use FMAD-like instructions or the last
3959 operand and use FMLA-like instructions; the latter seems more
3960 natural. */
3961
3962 static tree
3963 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3964 {
3965 return nops == 3 ? ops[2] : ops[0];
3966 }
3967
3968 /* Implement TARGET_HARD_REGNO_NREGS. */
3969
3970 static unsigned int
3971 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3972 {
3973 /* ??? Logically we should only need to provide a value when
3974 HARD_REGNO_MODE_OK says that the combination is valid,
3975 but at the moment we need to handle all modes. Just ignore
3976 any runtime parts for registers that can't store them. */
3977 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3978 switch (aarch64_regno_regclass (regno))
3979 {
3980 case FP_REGS:
3981 case FP_LO_REGS:
3982 case FP_LO8_REGS:
3983 {
3984 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3985 if (vec_flags & VEC_SVE_DATA)
3986 return exact_div (GET_MODE_SIZE (mode),
3987 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3988 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3989 return GET_MODE_SIZE (mode).to_constant () / 8;
3990 return CEIL (lowest_size, UNITS_PER_VREG);
3991 }
3992 case PR_REGS:
3993 case PR_LO_REGS:
3994 case PR_HI_REGS:
3995 case FFR_REGS:
3996 case PR_AND_FFR_REGS:
3997 return 1;
3998 default:
3999 return CEIL (lowest_size, UNITS_PER_WORD);
4000 }
4001 gcc_unreachable ();
4002 }
4003
4004 /* Implement TARGET_HARD_REGNO_MODE_OK. */
4005
4006 static bool
4007 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
4008 {
4009 if (mode == V8DImode)
4010 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
4011 && multiple_p (regno - R0_REGNUM, 2);
4012
4013 if (GET_MODE_CLASS (mode) == MODE_CC)
4014 return regno == CC_REGNUM;
4015
4016 if (regno == VG_REGNUM)
4017 /* This must have the same size as _Unwind_Word. */
4018 return mode == DImode;
4019
4020 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4021 if (vec_flags & VEC_SVE_PRED)
4022 return pr_or_ffr_regnum_p (regno);
4023
4024 if (pr_or_ffr_regnum_p (regno))
4025 return false;
4026
4027 if (regno == SP_REGNUM)
4028 /* The purpose of comparing with ptr_mode is to support the
4029 global register variable associated with the stack pointer
4030 register via the syntax of asm ("wsp") in ILP32. */
4031 return mode == Pmode || mode == ptr_mode;
4032
4033 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
4034 return mode == Pmode;
4035
4036 if (GP_REGNUM_P (regno))
4037 {
4038 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
4039 return false;
4040 if (known_le (GET_MODE_SIZE (mode), 8))
4041 return true;
4042 if (known_le (GET_MODE_SIZE (mode), 16))
4043 return (regno & 1) == 0;
4044 }
4045 else if (FP_REGNUM_P (regno))
4046 {
4047 if (vec_flags & VEC_STRUCT)
4048 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
4049 else
4050 return !VECTOR_MODE_P (mode) || vec_flags != 0;
4051 }
4052
4053 return false;
4054 }
4055
4056 /* Return true if a function with type FNTYPE returns its value in
4057 SVE vector or predicate registers. */
4058
4059 static bool
4060 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
4061 {
4062 tree return_type = TREE_TYPE (fntype);
4063
4064 pure_scalable_type_info pst_info;
4065 switch (pst_info.analyze (return_type))
4066 {
4067 case pure_scalable_type_info::IS_PST:
4068 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
4069 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
4070
4071 case pure_scalable_type_info::DOESNT_MATTER:
4072 gcc_assert (aarch64_return_in_memory_1 (return_type));
4073 return false;
4074
4075 case pure_scalable_type_info::NO_ABI_IDENTITY:
4076 case pure_scalable_type_info::ISNT_PST:
4077 return false;
4078 }
4079 gcc_unreachable ();
4080 }
4081
4082 /* Return true if a function with type FNTYPE takes arguments in
4083 SVE vector or predicate registers. */
4084
4085 static bool
4086 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4087 {
4088 CUMULATIVE_ARGS args_so_far_v;
4089 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4090 NULL_TREE, 0, true);
4091 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4092
4093 for (tree chain = TYPE_ARG_TYPES (fntype);
4094 chain && chain != void_list_node;
4095 chain = TREE_CHAIN (chain))
4096 {
4097 tree arg_type = TREE_VALUE (chain);
4098 if (arg_type == error_mark_node)
4099 return false;
4100
4101 function_arg_info arg (arg_type, /*named=*/true);
4102 apply_pass_by_reference_rules (&args_so_far_v, arg);
4103 pure_scalable_type_info pst_info;
4104 if (pst_info.analyze_registers (arg.type))
4105 {
4106 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4107 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4108 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4109 return true;
4110 }
4111
4112 targetm.calls.function_arg_advance (args_so_far, arg);
4113 }
4114 return false;
4115 }
4116
4117 /* Implement TARGET_FNTYPE_ABI. */
4118
4119 static const predefined_function_abi &
4120 aarch64_fntype_abi (const_tree fntype)
4121 {
4122 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4123 return aarch64_simd_abi ();
4124
4125 if (aarch64_returns_value_in_sve_regs_p (fntype)
4126 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4127 return aarch64_sve_abi ();
4128
4129 return default_function_abi;
4130 }
4131
4132 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4133
4134 static bool
4135 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4136 {
4137 return (aarch64_sve::builtin_type_p (type1)
4138 == aarch64_sve::builtin_type_p (type2));
4139 }
4140
4141 /* Return true if we should emit CFI for register REGNO. */
4142
4143 static bool
4144 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4145 {
4146 return (GP_REGNUM_P (regno)
4147 || !default_function_abi.clobbers_full_reg_p (regno));
4148 }
4149
4150 /* Return the mode we should use to save and restore register REGNO. */
4151
4152 static machine_mode
4153 aarch64_reg_save_mode (unsigned int regno)
4154 {
4155 if (GP_REGNUM_P (regno))
4156 return DImode;
4157
4158 if (FP_REGNUM_P (regno))
4159 switch (crtl->abi->id ())
4160 {
4161 case ARM_PCS_AAPCS64:
4162 /* Only the low 64 bits are saved by the base PCS. */
4163 return DFmode;
4164
4165 case ARM_PCS_SIMD:
4166 /* The vector PCS saves the low 128 bits (which is the full
4167 register on non-SVE targets). */
4168 return TFmode;
4169
4170 case ARM_PCS_SVE:
4171 /* Use vectors of DImode for registers that need frame
4172 information, so that the first 64 bits of the save slot

4173 are always the equivalent of what storing D<n> would give. */
4174 if (aarch64_emit_cfi_for_reg_p (regno))
4175 return VNx2DImode;
4176
4177 /* Use vectors of bytes otherwise, so that the layout is
4178 endian-agnostic, and so that we can use LDR and STR for
4179 big-endian targets. */
4180 return VNx16QImode;
4181
4182 case ARM_PCS_TLSDESC:
4183 case ARM_PCS_UNKNOWN:
4184 break;
4185 }
4186
4187 if (PR_REGNUM_P (regno))
4188 /* Save the full predicate register. */
4189 return VNx16BImode;
4190
4191 gcc_unreachable ();
4192 }
4193
4194 /* Implement TARGET_INSN_CALLEE_ABI. */
4195
4196 const predefined_function_abi &
4197 aarch64_insn_callee_abi (const rtx_insn *insn)
4198 {
4199 rtx pat = PATTERN (insn);
4200 gcc_assert (GET_CODE (pat) == PARALLEL);
4201 rtx unspec = XVECEXP (pat, 0, 1);
4202 gcc_assert (GET_CODE (unspec) == UNSPEC
4203 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4204 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4205 }
4206
4207 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4208 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4209 clobbers the top 64 bits when restoring the bottom 64 bits. */
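/* For example, under the base PCS (ARM_PCS_AAPCS64) a V16QImode value
   in a vector register is partially clobbered by a call, since its
   per-register size of 16 bytes exceeds the 8 callee-saved bytes;
   under ARM_PCS_SIMD it is not, since 16 does not exceed 16.  */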
4210
4211 static bool
4212 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4213 unsigned int regno,
4214 machine_mode mode)
4215 {
4216 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4217 {
4218 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4219 unsigned int nregs = hard_regno_nregs (regno, mode);
4220 if (nregs > 1)
4221 per_register_size = exact_div (per_register_size, nregs);
4222 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4223 return maybe_gt (per_register_size, 16);
4224 return maybe_gt (per_register_size, 8);
4225 }
4226 return false;
4227 }
4228
4229 /* Implement REGMODE_NATURAL_SIZE. */
4230 poly_uint64
4231 aarch64_regmode_natural_size (machine_mode mode)
4232 {
4233 /* The natural size for SVE data modes is one SVE data vector,
4234 and similarly for predicates. We can't independently modify
4235 anything smaller than that. */
4236 /* ??? For now, only do this for variable-width SVE registers.
4237 Doing it for constant-sized registers breaks lower-subreg.cc. */
4238 /* ??? And once that's fixed, we should probably have similar
4239 code for Advanced SIMD. */
4240 if (!aarch64_sve_vg.is_constant ())
4241 {
4242 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4243 if (vec_flags & VEC_SVE_PRED)
4244 return BYTES_PER_SVE_PRED;
4245 if (vec_flags & VEC_SVE_DATA)
4246 return BYTES_PER_SVE_VECTOR;
4247 }
4248 return UNITS_PER_WORD;
4249 }
4250
4251 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4252 machine_mode
4253 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4254 machine_mode mode)
4255 {
4256 /* The predicate mode determines which bits are significant and
4257 which are "don't care". Decreasing the number of lanes would
4258 lose data while increasing the number of lanes would make bits
4259 unnecessarily significant. */
4260 if (PR_REGNUM_P (regno))
4261 return mode;
4262 if (known_ge (GET_MODE_SIZE (mode), 4))
4263 return mode;
4264 else
4265 return SImode;
4266 }
4267
4268 /* Return true if I's bits are consecutive ones from the MSB. */
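/* For example, I == 0xffffffffffffff00 gives -I == 0x100, a power of
   two, so the result is true; I == 0 or I == 0xff00ff0000000000 gives
   a negation that is not a power of two, so the result is false.  */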
4269 bool
4270 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4271 {
4272 return exact_log2 (-i) != HOST_WIDE_INT_M1;
4273 }
4274
4275 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4276 that strcpy from constants will be faster. */
4277
4278 static HOST_WIDE_INT
4279 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4280 {
4281 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4282 return MAX (align, BITS_PER_WORD);
4283 return align;
4284 }
4285
4286 /* Return true if calls to DECL should be treated as
4287 long-calls (i.e. called via a register). */
4288 static bool
4289 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4290 {
4291 return false;
4292 }
4293
4294 /* Return true if calls to symbol-ref SYM should be treated as
4295 long-calls (i.e. called via a register). */
4296 bool
4297 aarch64_is_long_call_p (rtx sym)
4298 {
4299 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4300 }
4301
4302 /* Return true if calls to symbol-ref SYM should not go through
4303 plt stubs. */
4304
4305 bool
4306 aarch64_is_noplt_call_p (rtx sym)
4307 {
4308 const_tree decl = SYMBOL_REF_DECL (sym);
4309
4310 if (flag_pic
4311 && decl
4312 && (!flag_plt
4313 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4314 && !targetm.binds_local_p (decl))
4315 return true;
4316
4317 return false;
4318 }
4319
4320 /* Emit an insn that's a simple single-set. Both the operands must be
4321 known to be valid. */
4322 inline static rtx_insn *
4323 emit_set_insn (rtx x, rtx y)
4324 {
4325 return emit_insn (gen_rtx_SET (x, y));
4326 }
4327
4328 /* X and Y are two things to compare using CODE. Emit the compare insn and
4329 return the rtx for register 0 in the proper mode. */
4330 rtx
4331 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4332 {
4333 machine_mode cmp_mode = GET_MODE (x);
4334 machine_mode cc_mode;
4335 rtx cc_reg;
4336
4337 if (cmp_mode == TImode)
4338 {
4339 gcc_assert (code == NE);
4340
4341 cc_mode = CCmode;
4342 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4343
4344 rtx x_lo = operand_subword (x, 0, 0, TImode);
4345 rtx y_lo = operand_subword (y, 0, 0, TImode);
4346 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4347
4348 rtx x_hi = operand_subword (x, 1, 0, TImode);
4349 rtx y_hi = operand_subword (y, 1, 0, TImode);
4350 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4351 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4352 GEN_INT (AARCH64_EQ)));
4353 }
4354 else
4355 {
4356 cc_mode = SELECT_CC_MODE (code, x, y);
4357 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4358 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4359 }
4360 return cc_reg;
4361 }
4362
4363 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4364
4365 static rtx
4366 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4367 machine_mode y_mode)
4368 {
4369 if (y_mode == E_QImode || y_mode == E_HImode)
4370 {
4371 if (CONST_INT_P (y))
4372 {
4373 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4374 y_mode = SImode;
4375 }
4376 else
4377 {
4378 rtx t, cc_reg;
4379 machine_mode cc_mode;
4380
4381 t = gen_rtx_ZERO_EXTEND (SImode, y);
4382 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4383 cc_mode = CC_SWPmode;
4384 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4385 emit_set_insn (cc_reg, t);
4386 return cc_reg;
4387 }
4388 }
4389
4390 if (!aarch64_plus_operand (y, y_mode))
4391 y = force_reg (y_mode, y);
4392
4393 return aarch64_gen_compare_reg (code, x, y);
4394 }
4395
4396 /* Consider the operation:
4397
4398 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4399
4400 where:
4401
4402 - CODE is [SU]MAX or [SU]MIN
4403 - OPERANDS[2] and OPERANDS[3] are constant integers
4404 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4405 - all operands have mode MODE
4406
4407 Decide whether it is possible to implement the operation using:
4408
4409 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4410 or
4411 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4412
4413 followed by:
4414
4415 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4416
4417 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4418 If GENERATE_P is true, also update OPERANDS as follows:
4419
4420 OPERANDS[4] = -OPERANDS[3]
4421 OPERANDS[5] = the rtl condition representing <cond>
4422 OPERANDS[6] = <tmp>
4423 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
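/* Illustrative example: for SMAX with OPERANDS[2] == 10 and
   OPERANDS[3] == -11 (i.e. smax (x, 10) - 11), SUB_VAL is 11 and DIFF
   is -1, which is accepted because 10 < 11.  The expansion is
   SUBS <tmp>, x, 11 followed by CSINV <dst>, <tmp>, xzr, GE, giving
   x - 11 when x >= 11 and -1 otherwise.  */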
4424 bool
4425 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4426 {
4427 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4428 rtx dst = operands[0];
4429 rtx maxmin_op = operands[2];
4430 rtx add_op = operands[3];
4431 machine_mode mode = GET_MODE (dst);
4432
4433 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4434 == (x >= y ? x : y) - z
4435 == (x > y ? x : y) - z
4436 == (x > y - 1 ? x : y) - z
4437
4438 min (x, y) - z == (x <= y - 1 ? x : y) - z
4439 == (x <= y ? x : y) - z
4440 == (x < y ? x : y) - z
4441 == (x < y + 1 ? x : y) - z
4442
4443 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4444 which x is compared with z. Set DIFF to y - z. Thus the supported
4445 combinations are as follows, with DIFF being the value after the ":":
4446
4447 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4448 == x >= y ? x - y : 0 [z == y]
4449 == x > y ? x - y : 0 [z == y]
4450 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4451
4452 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4453 == x <= y ? x - y : 0 [z == y]
4454 == x < y ? x - y : 0 [z == y]
4455 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4456 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4457 auto add_val = rtx_mode_t (add_op, mode);
4458 auto sub_val = wi::neg (add_val);
4459 auto diff = wi::sub (maxmin_val, sub_val);
4460 if (!(diff == 0
4461 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4462 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4463 return false;
4464
4465 if (!generate_p)
4466 return true;
4467
4468 rtx_code cmp;
4469 switch (code)
4470 {
4471 case SMAX:
4472 cmp = diff == 1 ? GT : GE;
4473 break;
4474 case UMAX:
4475 cmp = diff == 1 ? GTU : GEU;
4476 break;
4477 case SMIN:
4478 cmp = diff == -1 ? LT : LE;
4479 break;
4480 case UMIN:
4481 cmp = diff == -1 ? LTU : LEU;
4482 break;
4483 default:
4484 gcc_unreachable ();
4485 }
4486 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4487
4488 operands[4] = immed_wide_int_const (sub_val, mode);
4489 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4490 if (can_create_pseudo_p ())
4491 operands[6] = gen_reg_rtx (mode);
4492 else
4493 operands[6] = dst;
4494 operands[7] = immed_wide_int_const (diff, mode);
4495
4496 return true;
4497 }
4498
4499
4500 /* Build the SYMBOL_REF for __tls_get_addr. */
4501
4502 static GTY(()) rtx tls_get_addr_libfunc;
4503
4504 rtx
4505 aarch64_tls_get_addr (void)
4506 {
4507 if (!tls_get_addr_libfunc)
4508 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4509 return tls_get_addr_libfunc;
4510 }
4511
4512 /* Return the TLS model to use for ADDR. */
4513
4514 static enum tls_model
4515 tls_symbolic_operand_type (rtx addr)
4516 {
4517 enum tls_model tls_kind = TLS_MODEL_NONE;
4518 poly_int64 offset;
4519 addr = strip_offset_and_salt (addr, &offset);
4520 if (SYMBOL_REF_P (addr))
4521 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4522
4523 return tls_kind;
4524 }
4525
4526 /* We allow lo_sum's in our legitimate addresses so that combine
4527 can take care of combining addresses where necessary, but for
4528 generation purposes, we generate the address
4529 as:
4530 RTL Absolute
4531 tmp = hi (symbol_ref); adrp x1, foo
4532 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
4533 nop
4534
4535 PIC TLS
4536 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4537 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4538 bl __tls_get_addr
4539 nop
4540
4541 Load TLS symbol, depending on TLS mechanism and TLS access model.
4542
4543 Global Dynamic - Traditional TLS:
4544 adrp tmp, :tlsgd:imm
4545 add dest, tmp, #:tlsgd_lo12:imm
4546 bl __tls_get_addr
4547
4548 Global Dynamic - TLS Descriptors:
4549 adrp dest, :tlsdesc:imm
4550 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4551 add dest, dest, #:tlsdesc_lo12:imm
4552 blr tmp
4553 mrs tp, tpidr_el0
4554 add dest, dest, tp
4555
4556 Initial Exec:
4557 mrs tp, tpidr_el0
4558 adrp tmp, :gottprel:imm
4559 ldr dest, [tmp, #:gottprel_lo12:imm]
4560 add dest, dest, tp
4561
4562 Local Exec:
4563 mrs tp, tpidr_el0
4564 add t0, tp, #:tprel_hi12:imm, lsl #12
4565 add t0, t0, #:tprel_lo12_nc:imm
4566 */
4567
4568 static void
4569 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4570 enum aarch64_symbol_type type)
4571 {
4572 switch (type)
4573 {
4574 case SYMBOL_SMALL_ABSOLUTE:
4575 {
4576 /* In ILP32, the mode of dest can be either SImode or DImode. */
4577 rtx tmp_reg = dest;
4578 machine_mode mode = GET_MODE (dest);
4579
4580 gcc_assert (mode == Pmode || mode == ptr_mode);
4581
4582 if (can_create_pseudo_p ())
4583 tmp_reg = gen_reg_rtx (mode);
4584
4585 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4586 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4587 return;
4588 }
4589
4590 case SYMBOL_TINY_ABSOLUTE:
4591 emit_insn (gen_rtx_SET (dest, imm));
4592 return;
4593
4594 case SYMBOL_SMALL_GOT_28K:
4595 {
4596 machine_mode mode = GET_MODE (dest);
4597 rtx gp_rtx = pic_offset_table_rtx;
4598 rtx insn;
4599 rtx mem;
4600
4601 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4602 here before RTL expansion. Tree IVOPTS generates RTL patterns to
4603 decide rtx costs, in which case pic_offset_table_rtx is not
4604 initialized. In that case there is no need to generate the first
4605 adrp instruction, since the final cost for global variable access
4606 is one instruction. */
4607 if (gp_rtx != NULL)
4608 {
4609 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
4610 we use the page base as the GOT base, the first page may be wasted;
4611 in the worst case there is only 28K of space for the GOT).
4612
4613 The generated instruction sequence for accessing a global variable
4614 is:
4615
4616 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4617
4618 Only one instruction is needed. But we must initialize
4619 pic_offset_table_rtx properly. We generate an initialization insn
4620 for every global access and allow CSE to remove the redundant ones.
4621
4622 The final instruction sequence will look like the following
4623 for multiple global variable accesses.
4624
4625 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4626
4627 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4628 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4629 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4630 ... */
4631
4632 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4633 crtl->uses_pic_offset_table = 1;
4634 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4635
4636 if (mode != GET_MODE (gp_rtx))
4637 gp_rtx = gen_lowpart (mode, gp_rtx);
4638
4639 }
4640
4641 if (mode == ptr_mode)
4642 {
4643 if (mode == DImode)
4644 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4645 else
4646 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4647
4648 mem = XVECEXP (SET_SRC (insn), 0, 0);
4649 }
4650 else
4651 {
4652 gcc_assert (mode == Pmode);
4653
4654 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4655 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4656 }
4657
4658 /* The operand is expected to be a MEM. Whenever the related insn
4659 pattern changes, the code above that calculates MEM should be
4660 updated. */
4661 gcc_assert (MEM_P (mem));
4662 MEM_READONLY_P (mem) = 1;
4663 MEM_NOTRAP_P (mem) = 1;
4664 emit_insn (insn);
4665 return;
4666 }
4667
4668 case SYMBOL_SMALL_GOT_4G:
4669 emit_insn (gen_rtx_SET (dest, imm));
4670 return;
4671
4672 case SYMBOL_SMALL_TLSGD:
4673 {
4674 rtx_insn *insns;
4675 /* The return type of __tls_get_addr is the C pointer type
4676 so use ptr_mode. */
4677 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4678 rtx tmp_reg = dest;
4679
4680 if (GET_MODE (dest) != ptr_mode)
4681 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4682
4683 start_sequence ();
4684 if (ptr_mode == SImode)
4685 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4686 else
4687 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4688 insns = get_insns ();
4689 end_sequence ();
4690
4691 RTL_CONST_CALL_P (insns) = 1;
4692 emit_libcall_block (insns, tmp_reg, result, imm);
4693 /* Convert back to the mode of the dest, adding a zero_extend
4694 from SImode (ptr_mode) to DImode (Pmode). */
4695 if (dest != tmp_reg)
4696 convert_move (dest, tmp_reg, true);
4697 return;
4698 }
4699
4700 case SYMBOL_SMALL_TLSDESC:
4701 {
4702 machine_mode mode = GET_MODE (dest);
4703 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4704 rtx tp;
4705
4706 gcc_assert (mode == Pmode || mode == ptr_mode);
4707
4708 /* In ILP32, the got entry is always of SImode size. Unlike
4709 small GOT, the dest is fixed at reg 0. */
4710 if (TARGET_ILP32)
4711 emit_insn (gen_tlsdesc_small_si (imm));
4712 else
4713 emit_insn (gen_tlsdesc_small_di (imm));
4714 tp = aarch64_load_tp (NULL);
4715
4716 if (mode != Pmode)
4717 tp = gen_lowpart (mode, tp);
4718
4719 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4720 if (REG_P (dest))
4721 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4722 return;
4723 }
4724
4725 case SYMBOL_SMALL_TLSIE:
4726 {
4727 /* In ILP32, the mode of dest can be either SImode or DImode,
4728 while the got entry is always of SImode size. The mode of
4729 dest depends on how dest is used: if dest is assigned to a
4730 pointer (e.g. in the memory), it has SImode; it may have
4731 DImode if dest is dereferenced to access the memory.
4732 This is why we have to handle three different tlsie_small
4733 patterns here (two patterns for ILP32). */
4734 machine_mode mode = GET_MODE (dest);
4735 rtx tmp_reg = gen_reg_rtx (mode);
4736 rtx tp = aarch64_load_tp (NULL);
4737
4738 if (mode == ptr_mode)
4739 {
4740 if (mode == DImode)
4741 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4742 else
4743 {
4744 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4745 tp = gen_lowpart (mode, tp);
4746 }
4747 }
4748 else
4749 {
4750 gcc_assert (mode == Pmode);
4751 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4752 }
4753
4754 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4755 if (REG_P (dest))
4756 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4757 return;
4758 }
4759
4760 case SYMBOL_TLSLE12:
4761 case SYMBOL_TLSLE24:
4762 case SYMBOL_TLSLE32:
4763 case SYMBOL_TLSLE48:
4764 {
4765 machine_mode mode = GET_MODE (dest);
4766 rtx tp = aarch64_load_tp (NULL);
4767
4768 if (mode != Pmode)
4769 tp = gen_lowpart (mode, tp);
4770
4771 switch (type)
4772 {
4773 case SYMBOL_TLSLE12:
4774 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4775 (dest, tp, imm));
4776 break;
4777 case SYMBOL_TLSLE24:
4778 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4779 (dest, tp, imm));
4780 break;
4781 case SYMBOL_TLSLE32:
4782 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4783 (dest, imm));
4784 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4785 (dest, dest, tp));
4786 break;
4787 case SYMBOL_TLSLE48:
4788 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4789 (dest, imm));
4790 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4791 (dest, dest, tp));
4792 break;
4793 default:
4794 gcc_unreachable ();
4795 }
4796
4797 if (REG_P (dest))
4798 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4799 return;
4800 }
4801
4802 case SYMBOL_TINY_GOT:
4803 {
4804 rtx insn;
4805 machine_mode mode = GET_MODE (dest);
4806
4807 if (mode == ptr_mode)
4808 insn = gen_ldr_got_tiny (mode, dest, imm);
4809 else
4810 {
4811 gcc_assert (mode == Pmode);
4812 insn = gen_ldr_got_tiny_sidi (dest, imm);
4813 }
4814
4815 emit_insn (insn);
4816 return;
4817 }
4818
4819 case SYMBOL_TINY_TLSIE:
4820 {
4821 machine_mode mode = GET_MODE (dest);
4822 rtx tp = aarch64_load_tp (NULL);
4823
4824 if (mode == ptr_mode)
4825 {
4826 if (mode == DImode)
4827 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4828 else
4829 {
4830 tp = gen_lowpart (mode, tp);
4831 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4832 }
4833 }
4834 else
4835 {
4836 gcc_assert (mode == Pmode);
4837 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4838 }
4839
4840 if (REG_P (dest))
4841 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4842 return;
4843 }
4844
4845 default:
4846 gcc_unreachable ();
4847 }
4848 }
4849
4850 /* Emit a move from SRC to DEST. Assume that the move expanders can
4851 handle all moves if !can_create_pseudo_p (). The distinction is
4852 important because, unlike emit_move_insn, the move expanders know
4853 how to force Pmode objects into the constant pool even when the
4854 constant pool address is not itself legitimate. */
4855 static rtx
4856 aarch64_emit_move (rtx dest, rtx src)
4857 {
4858 return (can_create_pseudo_p ()
4859 ? emit_move_insn (dest, src)
4860 : emit_move_insn_1 (dest, src));
4861 }
4862
4863 /* Apply UNOPTAB to OP and store the result in DEST. */
4864
4865 static void
4866 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4867 {
4868 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4869 if (dest != tmp)
4870 emit_move_insn (dest, tmp);
4871 }
4872
4873 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4874
4875 static void
4876 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4877 {
4878 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4879 OPTAB_DIRECT);
4880 if (dest != tmp)
4881 emit_move_insn (dest, tmp);
4882 }
4883
4884 /* Split a 128-bit move operation into two 64-bit move operations,
4885 taking care to handle partial overlap of register to register
4886 copies. Special cases are needed when moving between GP regs and
4887 FP regs. SRC can be a register, constant or memory; DST a register
4888 or memory. If either operand is memory it must not have any side
4889 effects. */
4890 void
4891 aarch64_split_128bit_move (rtx dst, rtx src)
4892 {
4893 rtx dst_lo, dst_hi;
4894 rtx src_lo, src_hi;
4895
4896 machine_mode mode = GET_MODE (dst);
4897
4898 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4899 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4900 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4901
4902 if (REG_P (dst) && REG_P (src))
4903 {
4904 int src_regno = REGNO (src);
4905 int dst_regno = REGNO (dst);
4906
4907 /* Handle FP <-> GP regs. */
4908 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4909 {
4910 src_lo = gen_lowpart (word_mode, src);
4911 src_hi = gen_highpart (word_mode, src);
4912
4913 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4914 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4915 return;
4916 }
4917 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4918 {
4919 dst_lo = gen_lowpart (word_mode, dst);
4920 dst_hi = gen_highpart (word_mode, dst);
4921
4922 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4923 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4924 return;
4925 }
4926 }
4927
4928 dst_lo = gen_lowpart (word_mode, dst);
4929 dst_hi = gen_highpart (word_mode, dst);
4930 src_lo = gen_lowpart (word_mode, src);
4931 src_hi = gen_highpart_mode (word_mode, mode, src);
4932
4933 /* At most one pairing may overlap. */
4934 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4935 {
4936 aarch64_emit_move (dst_hi, src_hi);
4937 aarch64_emit_move (dst_lo, src_lo);
4938 }
4939 else
4940 {
4941 aarch64_emit_move (dst_lo, src_lo);
4942 aarch64_emit_move (dst_hi, src_hi);
4943 }
4944 }
4945
4946 /* Return true if we should split a move from 128-bit value SRC
4947 to 128-bit register DEST. */
4948
4949 bool
4950 aarch64_split_128bit_move_p (rtx dst, rtx src)
4951 {
4952 if (FP_REGNUM_P (REGNO (dst)))
4953 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4954 /* All moves to GPRs need to be split. */
4955 return true;
4956 }
4957
4958 /* Split a complex SIMD move. */
4959
4960 void
4961 aarch64_split_simd_move (rtx dst, rtx src)
4962 {
4963 machine_mode src_mode = GET_MODE (src);
4964 machine_mode dst_mode = GET_MODE (dst);
4965
4966 gcc_assert (VECTOR_MODE_P (dst_mode));
4967
4968 if (REG_P (dst) && REG_P (src))
4969 {
4970 gcc_assert (VECTOR_MODE_P (src_mode));
4971 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
4972 }
4973 }
4974
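/* Return true if the constant X, of mode XMODE, is equal to the
   constant Y, of mode YMODE, zero-extended to XMODE.  */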
4975 bool
4976 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4977 machine_mode ymode, rtx y)
4978 {
4979 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4980 gcc_assert (r != NULL);
4981 return rtx_equal_p (x, r);
4982 }
4983
4984 /* Return TARGET if it is nonnull and a register of mode MODE.
4985 Otherwise, return a fresh register of mode MODE if we can,
4986 or TARGET reinterpreted as MODE if we can't. */
4987
4988 static rtx
4989 aarch64_target_reg (rtx target, machine_mode mode)
4990 {
4991 if (target && REG_P (target) && GET_MODE (target) == mode)
4992 return target;
4993 if (!can_create_pseudo_p ())
4994 {
4995 gcc_assert (target);
4996 return gen_lowpart (mode, target);
4997 }
4998 return gen_reg_rtx (mode);
4999 }
5000
5001 /* Return a register that contains the constant in BUILDER, given that
5002 the constant is a legitimate move operand. Use TARGET as the register
5003 if it is nonnull and convenient. */
5004
5005 static rtx
5006 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
5007 {
5008 rtx src = builder.build ();
5009 target = aarch64_target_reg (target, GET_MODE (src));
5010 emit_insn (gen_rtx_SET (target, src));
5011 return target;
5012 }
5013
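/* Return a register that holds VALUE in mode MODE: use a fresh pseudo
   if we can create one, otherwise move VALUE into the existing
   register X and return X.  */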
5014 static rtx
5015 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
5016 {
5017 if (can_create_pseudo_p ())
5018 return force_reg (mode, value);
5019 else
5020 {
5021 gcc_assert (x);
5022 aarch64_emit_move (x, value);
5023 return x;
5024 }
5025 }
5026
5027 /* Return true if predicate value X is a constant in which every element
5028 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
5029 value, i.e. as a predicate in which all bits are significant. */
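/* For example, a VNx4BImode constant of all-true elements is described
   in BUILDER as the repeating VNx16BImode sequence { 1, 0, 0, 0, ... },
   since each VNx4BImode element corresponds to FACTOR == 4 bits of
   VNx16BImode.  */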
5030
5031 static bool
5032 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
5033 {
5034 if (!CONST_VECTOR_P (x))
5035 return false;
5036
5037 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
5038 GET_MODE_NUNITS (GET_MODE (x)));
5039 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
5040 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
5041 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
5042
5043 unsigned int nelts = const_vector_encoded_nelts (x);
5044 for (unsigned int i = 0; i < nelts; ++i)
5045 {
5046 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
5047 if (!CONST_INT_P (elt))
5048 return false;
5049
5050 builder.quick_push (elt);
5051 for (unsigned int j = 1; j < factor; ++j)
5052 builder.quick_push (const0_rtx);
5053 }
5054 builder.finalize ();
5055 return true;
5056 }
5057
5058 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
5059 widest predicate element size it can have (that is, the largest size
5060 for which each element would still be 0 or 1). */
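/* For example, a constant with four patterns whose encoding is
   { 1, 0, 0, 0 } gives MASK == 8 | 4 == 12, so the widest element size
   is 4; any set bit at an odd index forces the result to 1.  */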
5061
5062 unsigned int
5063 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
5064 {
5065 /* Start with the most optimistic assumption: that we only need
5066 one bit per pattern. This is what we will use if only the first
5067 bit in each pattern is ever set. */
5068 unsigned int mask = GET_MODE_SIZE (DImode);
5069 mask |= builder.npatterns ();
5070
5071 /* Look for set bits. */
5072 unsigned int nelts = builder.encoded_nelts ();
5073 for (unsigned int i = 1; i < nelts; ++i)
5074 if (INTVAL (builder.elt (i)) != 0)
5075 {
5076 if (i & 1)
5077 return 1;
5078 mask |= i;
5079 }
5080 return mask & -mask;
5081 }
5082
5083 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5084 return that predicate mode, otherwise return opt_machine_mode (). */
5085
5086 opt_machine_mode
5087 aarch64_ptrue_all_mode (rtx x)
5088 {
5089 gcc_assert (GET_MODE (x) == VNx16BImode);
5090 if (!CONST_VECTOR_P (x)
5091 || !CONST_VECTOR_DUPLICATE_P (x)
5092 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5093 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5094 return opt_machine_mode ();
5095
5096 unsigned int nelts = const_vector_encoded_nelts (x);
5097 for (unsigned int i = 1; i < nelts; ++i)
5098 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5099 return opt_machine_mode ();
5100
5101 return aarch64_sve_pred_mode (nelts);
5102 }
5103
5104 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5105 that the constant would have with predicate element size ELT_SIZE
5106 (ignoring the upper bits in each element) and return:
5107
5108 * -1 if all bits are set
5109 * N if the predicate has N leading set bits followed by all clear bits
5110 * 0 if the predicate does not have any of these forms. */
5111
5112 int
5113 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5114 unsigned int elt_size)
5115 {
5116 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5117 followed by set bits. */
5118 if (builder.nelts_per_pattern () == 3)
5119 return 0;
5120
5121 /* Skip over leading set bits. */
5122 unsigned int nelts = builder.encoded_nelts ();
5123 unsigned int i = 0;
5124 for (; i < nelts; i += elt_size)
5125 if (INTVAL (builder.elt (i)) == 0)
5126 break;
5127 unsigned int vl = i / elt_size;
5128
5129 /* Check for the all-true case. */
5130 if (i == nelts)
5131 return -1;
5132
5133 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5134 repeating pattern of set bits followed by clear bits. */
5135 if (builder.nelts_per_pattern () != 2)
5136 return 0;
5137
5138 /* We have a "foreground" value and a duplicated "background" value.
5139 If the background might repeat and the last set bit belongs to it,
5140 we might have set bits followed by clear bits followed by set bits. */
5141 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5142 return 0;
5143
5144 /* Make sure that the rest are all clear. */
5145 for (; i < nelts; i += elt_size)
5146 if (INTVAL (builder.elt (i)) != 0)
5147 return 0;
5148
5149 return vl;
5150 }
5151
5152 /* See if there is an svpattern that encodes an SVE predicate of mode
5153 PRED_MODE in which the first VL bits are set and the rest are clear.
5154 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5155 A VL of -1 indicates an all-true vector. */
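/* For example, a VL of 6 gives AARCH64_SV_VL6 and a VL of 32 gives
   AARCH64_SV_VL32; if the predicate mode has a constant 16 elements,
   a VL of 15 gives AARCH64_SV_MUL3.  */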
5156
5157 aarch64_svpattern
5158 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5159 {
5160 if (vl < 0)
5161 return AARCH64_SV_ALL;
5162
5163 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5164 return AARCH64_NUM_SVPATTERNS;
5165
5166 if (vl >= 1 && vl <= 8)
5167 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5168
5169 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5170 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5171
5172 int max_vl;
5173 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5174 {
5175 if (vl == (max_vl / 3) * 3)
5176 return AARCH64_SV_MUL3;
5177 /* These would only trigger for non-power-of-2 lengths. */
5178 if (vl == (max_vl & -4))
5179 return AARCH64_SV_MUL4;
5180 if (vl == (1 << floor_log2 (max_vl)))
5181 return AARCH64_SV_POW2;
5182 if (vl == max_vl)
5183 return AARCH64_SV_ALL;
5184 }
5185 return AARCH64_NUM_SVPATTERNS;
5186 }
5187
5188 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5189 bits has the lowest bit set and the upper bits clear. This is the
5190 VNx16BImode equivalent of a PTRUE for controlling elements of
5191 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5192 all bits are significant, even the upper zeros. */
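/* For example, aarch64_ptrue_all (4) builds the repeating sequence
   { 1, 0, 0, 0, ... }, the VNx16BImode equivalent of a PTRUE for
   4-byte (.S) elements.  */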
5193
5194 rtx
5195 aarch64_ptrue_all (unsigned int elt_size)
5196 {
5197 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5198 builder.quick_push (const1_rtx);
5199 for (unsigned int i = 1; i < elt_size; ++i)
5200 builder.quick_push (const0_rtx);
5201 return builder.build ();
5202 }
5203
5204 /* Return an all-true predicate register of mode MODE. */
5205
5206 rtx
5207 aarch64_ptrue_reg (machine_mode mode)
5208 {
5209 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5210 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5211 return gen_lowpart (mode, reg);
5212 }
5213
5214 /* Return an all-false predicate register of mode MODE. */
5215
5216 rtx
5217 aarch64_pfalse_reg (machine_mode mode)
5218 {
5219 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5220 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5221 return gen_lowpart (mode, reg);
5222 }
5223
5224 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5225 for it. PRED2[0] is the predicate for the instruction whose result
5226 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5227 for it. Return true if we can prove that the two predicates are
5228 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5229 with PRED1[0] without changing behavior. */
5230
5231 bool
5232 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5233 {
5234 machine_mode mode = GET_MODE (pred1[0]);
5235 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5236 && mode == GET_MODE (pred2[0])
5237 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5238 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5239
5240 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5241 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5242 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5243 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5244 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5245 }
5246
5247 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5248 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5249 Use TARGET as the target register if nonnull and convenient. */
5250
5251 static rtx
5252 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5253 machine_mode data_mode, rtx op1, rtx op2)
5254 {
5255 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5256 expand_operand ops[5];
5257 create_output_operand (&ops[0], target, pred_mode);
5258 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5259 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5260 create_input_operand (&ops[3], op1, data_mode);
5261 create_input_operand (&ops[4], op2, data_mode);
5262 expand_insn (icode, 5, ops);
5263 return ops[0].value;
5264 }
5265
5266 /* Use a comparison to convert integer vector SRC into MODE, which is
5267 the corresponding SVE predicate mode. Use TARGET for the result
5268 if it's nonnull and convenient. */
5269
5270 rtx
5271 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5272 {
5273 machine_mode src_mode = GET_MODE (src);
5274 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5275 src, CONST0_RTX (src_mode));
5276 }
5277
5278 /* Return the assembly token for svprfop value PRFOP. */
5279
5280 static const char *
5281 svprfop_token (enum aarch64_svprfop prfop)
5282 {
5283 switch (prfop)
5284 {
5285 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5286 AARCH64_FOR_SVPRFOP (CASE)
5287 #undef CASE
5288 case AARCH64_NUM_SVPRFOPS:
5289 break;
5290 }
5291 gcc_unreachable ();
5292 }
5293
5294 /* Return the assembly string for an SVE prefetch operation with
5295 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5296 and that SUFFIX is the format for the remaining operands. */
5297
5298 char *
5299 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5300 const char *suffix)
5301 {
5302 static char buffer[128];
5303 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5304 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5305 mnemonic, svprfop_token (prfop), suffix);
5306 gcc_assert (written < sizeof (buffer));
5307 return buffer;
5308 }
5309
5310 /* Check whether we can calculate the number of elements in PATTERN
5311 at compile time, given that there are NELTS_PER_VQ elements per
5312 128-bit block. Return the value if so, otherwise return -1. */
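/* For example, if the vector length is fixed at 256 bits (so
   aarch64_sve_vg is the constant 4) and NELTS_PER_VQ is 4, there are
   8 elements in total: AARCH64_SV_MUL3 folds to 6, AARCH64_SV_POW2 and
   AARCH64_SV_ALL fold to 8, and AARCH64_SV_VL16 folds to 0 because it
   requests more elements than are available.  */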
5313
5314 HOST_WIDE_INT
5315 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5316 {
5317 unsigned int vl, const_vg;
5318 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5319 vl = 1 + (pattern - AARCH64_SV_VL1);
5320 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5321 vl = 16 << (pattern - AARCH64_SV_VL16);
5322 else if (aarch64_sve_vg.is_constant (&const_vg))
5323 {
5324 /* There are two vector granules per quadword. */
5325 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5326 switch (pattern)
5327 {
5328 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5329 case AARCH64_SV_MUL4: return nelts & -4;
5330 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5331 case AARCH64_SV_ALL: return nelts;
5332 default: gcc_unreachable ();
5333 }
5334 }
5335 else
5336 return -1;
5337
5338 /* There are two vector granules per quadword. */
5339 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5340 if (known_le (vl, nelts_all))
5341 return vl;
5342
5343 /* Requesting more elements than are available results in a PFALSE. */
5344 if (known_gt (vl, nelts_all))
5345 return 0;
5346
5347 return -1;
5348 }
5349
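/* Worked example, added for illustration (values checked by hand): with the
   vector length fixed at 256 bits, aarch64_sve_vg is the constant 4, so for
   .S elements (NELTS_PER_VQ == 4) there are (4 / 2) * 4 == 8 elements in
   total.  POW2 and ALL then fold to 8, MUL4 to 8, MUL3 to 6, VL7 to 7, and
   VL16 to 0 because it asks for more elements than are available.  */
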
5350 /* Return true if we can move VALUE into a register using a single
5351 CNT[BHWD] instruction. */
5352
5353 static bool
5354 aarch64_sve_cnt_immediate_p (poly_int64 value)
5355 {
5356 HOST_WIDE_INT factor = value.coeffs[0];
5357 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5358 return (value.coeffs[1] == factor
5359 && IN_RANGE (factor, 2, 16 * 16)
5360 && (factor & 1) == 0
5361 && factor <= 16 * (factor & -factor));
5362 }
5363
5364 /* Likewise for rtx X. */
5365
5366 bool
5367 aarch64_sve_cnt_immediate_p (rtx x)
5368 {
5369 poly_int64 value;
5370 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5371 }
5372
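/* Some hand-checked examples of the conditions above: (2, 2), (6, 6) and
   (32, 32) are accepted and correspond to CNTD, "CNTD ..., ALL, MUL #3" and
   "CNTB ..., ALL, MUL #2" respectively.  (3, 3) is rejected because the
   coefficient is odd, (136, 136) because it would need a multiplier of 17,
   and (4, 2) because the two coefficients differ.  */
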
5373 /* Return the asm string for an instruction with a CNT-like vector size
5374 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5375 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5376 first part of the operands template (the part that comes before the
5377 vector size itself). PATTERN is the pattern to use. FACTOR is the
5378 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5379 in each quadword. If it is zero, we can use any element size. */
5380
5381 static char *
5382 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5383 aarch64_svpattern pattern,
5384 unsigned int factor,
5385 unsigned int nelts_per_vq)
5386 {
5387 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5388
5389 if (nelts_per_vq == 0)
5390 /* There is some overlap in the ranges of the four CNT instructions.
5391 Here we always use the smallest possible element size, so that the
5392 multiplier is 1 wherever possible. */
5393 nelts_per_vq = factor & -factor;
5394 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5395 gcc_assert (IN_RANGE (shift, 1, 4));
5396 char suffix = "dwhb"[shift - 1];
5397
5398 factor >>= shift;
5399 unsigned int written;
5400 if (pattern == AARCH64_SV_ALL && factor == 1)
5401 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5402 prefix, suffix, operands);
5403 else if (factor == 1)
5404 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5405 prefix, suffix, operands, svpattern_token (pattern));
5406 else
5407 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5408 prefix, suffix, operands, svpattern_token (pattern),
5409 factor);
5410 gcc_assert (written < sizeof (buffer));
5411 return buffer;
5412 }
5413
5414 /* Return the asm string for an instruction with a CNT-like vector size
5415 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5416 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5417 first part of the operands template (the part that comes before the
5418 vector size itself). X is the value of the vector size operand,
5419 as a polynomial integer rtx; we need to convert this into an "all"
5420 pattern with a multiplier. */
5421
5422 char *
5423 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5424 rtx x)
5425 {
5426 poly_int64 value = rtx_to_poly_int64 (x);
5427 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5428 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5429 value.coeffs[1], 0);
5430 }
5431
5432 /* Return the asm string for an instruction with a CNT-like vector size
5433 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5434 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5435 first part of the operands template (the part that comes before the
5436 vector size itself). CNT_PAT[0..2] are the operands of the
5437 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5438
5439 char *
5440 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5441 const char *operands, rtx *cnt_pat)
5442 {
5443 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5444 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5445 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5446 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5447 factor, nelts_per_vq);
5448 }
5449
5450 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5451
5452 bool
5453 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5454 {
5455 poly_int64 value;
5456 return (poly_int_rtx_p (x, &value)
5457 && (aarch64_sve_cnt_immediate_p (value)
5458 || aarch64_sve_cnt_immediate_p (-value)));
5459 }
5460
5461 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5462 operand 0. */
5463
5464 char *
5465 aarch64_output_sve_scalar_inc_dec (rtx offset)
5466 {
5467 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5468 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5469 if (offset_value.coeffs[1] > 0)
5470 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5471 offset_value.coeffs[1], 0);
5472 else
5473 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5474 -offset_value.coeffs[1], 0);
5475 }
5476
5477 /* Return true if we can add VALUE to a register using a single ADDVL
5478 or ADDPL instruction. */
5479
5480 static bool
5481 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5482 {
5483 HOST_WIDE_INT factor = value.coeffs[0];
5484 if (factor == 0 || value.coeffs[1] != factor)
5485 return false;
5486 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5487 and a value of 16 is one vector width. */
5488 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5489 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5490 }
5491
5492 /* Likewise for rtx X. */
5493
5494 bool
5495 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5496 {
5497 poly_int64 value;
5498 return (poly_int_rtx_p (x, &value)
5499 && aarch64_sve_addvl_addpl_immediate_p (value));
5500 }
5501
5502 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5503 to operand 1 and storing the result in operand 0. */
5504
5505 char *
5506 aarch64_output_sve_addvl_addpl (rtx offset)
5507 {
5508 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5509 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5510 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5511
5512 int factor = offset_value.coeffs[1];
5513 if ((factor & 15) == 0)
5514 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5515 else
5516 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5517 return buffer;
5518 }
5519
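/* Worked examples, added for illustration: an offset of (32, 32) (two vector
   lengths in bytes) has FACTOR == 32 and is output as "addvl\t%x0, %x1, #2",
   while (6, 6) has FACTOR == 6 and is output as "addpl\t%x0, %x1, #3".
   (528, 528) is rejected because the ADDVL multiplier would be 33, outside
   the range [-32, 31].  */
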
5520 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5521 instruction. If it is, store the number of elements in each vector
5522 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5523 factor in *FACTOR_OUT (if nonnull). */
5524
5525 bool
5526 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5527 unsigned int *nelts_per_vq_out)
5528 {
5529 rtx elt;
5530 poly_int64 value;
5531
5532 if (!const_vec_duplicate_p (x, &elt)
5533 || !poly_int_rtx_p (elt, &value))
5534 return false;
5535
5536 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5537 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5538 /* There's no vector INCB. */
5539 return false;
5540
5541 HOST_WIDE_INT factor = value.coeffs[0];
5542 if (value.coeffs[1] != factor)
5543 return false;
5544
5545 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5546 if ((factor % nelts_per_vq) != 0
5547 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5548 return false;
5549
5550 if (factor_out)
5551 *factor_out = factor;
5552 if (nelts_per_vq_out)
5553 *nelts_per_vq_out = nelts_per_vq;
5554 return true;
5555 }
5556
5557 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5558 instruction. */
5559
5560 bool
5561 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5562 {
5563 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5564 }
5565
5566 /* Return the asm template for an SVE vector INC or DEC instruction.
5567 OPERANDS gives the operands before the vector count and X is the
5568 value of the vector count operand itself. */
5569
5570 char *
5571 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5572 {
5573 int factor;
5574 unsigned int nelts_per_vq;
5575 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5576 gcc_unreachable ();
5577 if (factor < 0)
5578 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5579 -factor, nelts_per_vq);
5580 else
5581 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5582 factor, nelts_per_vq);
5583 }
5584
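/* Worked example, added for illustration: a VNx8HI constant that duplicates
   the poly_int64 (24, 24) passes the checks above with NELTS_PER_VQ == 8 and
   FACTOR == 24, and is output as something like "inch\tz0.h, all, mul #3";
   the negated constant uses "dech" instead.  */
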
5585 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5586
5587 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5588 {
5589 0x0000000100000001ull,
5590 0x0001000100010001ull,
5591 0x0101010101010101ull,
5592 0x1111111111111111ull,
5593 0x5555555555555555ull,
5594 };
5595
5596
5597
5598 /* Return true if 64-bit VAL is a valid bitmask immediate. */
5599 static bool
5600 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
5601 {
5602 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
5603 int bits;
5604
5605 /* Check for a single sequence of one bits and return quickly if so.
5606 The special cases of all ones and all zeroes return false. */
5607 tmp = val + (val & -val);
5608
5609 if (tmp == (tmp & -tmp))
5610 return (val + 1) > 1;
5611
5612 /* Invert if the immediate doesn't start with a zero bit - this means we
5613 only need to search for sequences of one bits. */
5614 if (val & 1)
5615 val = ~val;
5616
5617 /* Find the first set bit and set tmp to val with the first sequence of one
5618 bits removed. Return success if there is a single sequence of ones. */
5619 first_one = val & -val;
5620 tmp = val & (val + first_one);
5621
5622 if (tmp == 0)
5623 return true;
5624
5625 /* Find the next set bit and compute the difference in bit position. */
5626 next_one = tmp & -tmp;
5627 bits = clz_hwi (first_one) - clz_hwi (next_one);
5628 mask = val ^ tmp;
5629
5630 /* Check the bit position difference is a power of 2, and that the first
5631 sequence of one bits fits within 'bits' bits. */
5632 if ((mask >> bits) != 0 || bits != (bits & -bits))
5633 return false;
5634
5635 /* Check the sequence of one bits is repeated 64/bits times. */
5636 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5637 }
5638
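/* Hand-checked examples for the function above: 0x0000000000fff000 is
   accepted by the single-sequence fast path, 0x00ff00ff00ff00ff is accepted
   as a 16-bit pattern repeated four times, and 0x0000000000001234 is
   rejected because its set bits do not form a single (rotated) run.
   All-zeros and all-ones are rejected, as the comment above notes.  */
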
5639
5640 /* Return true if VAL is a valid bitmask immediate for MODE. */
5641 bool
5642 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5643 {
5644 if (mode == DImode)
5645 return aarch64_bitmask_imm (val);
5646
5647 if (mode == SImode)
5648 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
5649
5650 /* Replicate small immediates to fit 64 bits. */
5651 int size = GET_MODE_UNIT_PRECISION (mode);
5652 val &= (HOST_WIDE_INT_1U << size) - 1;
5653 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
5654
5655 return aarch64_bitmask_imm (val);
5656 }
5657
5658
5659 /* Return true if the immediate VAL can be a bitmask immediate
5660 by changing the given MASK bits in VAL to zeroes, ones or bits
5661 from the other half of VAL. Return the new immediate in VAL2. */
5662 static inline bool
5663 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
5664 unsigned HOST_WIDE_INT &val2,
5665 unsigned HOST_WIDE_INT mask)
5666 {
5667 val2 = val & ~mask;
5668 if (val2 != val && aarch64_bitmask_imm (val2))
5669 return true;
5670 val2 = val | mask;
5671 if (val2 != val && aarch64_bitmask_imm (val2))
5672 return true;
5673 val = val & ~mask;
5674 val2 = val | (((val >> 32) | (val << 32)) & mask);
5675 if (val2 != val && aarch64_bitmask_imm (val2))
5676 return true;
5677 val2 = val | (((val >> 16) | (val << 48)) & mask);
5678 if (val2 != val && aarch64_bitmask_imm (val2))
5679 return true;
5680 return false;
5681 }
5682
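/* Worked example, added for illustration: for VAL == 0x0fff00000fff1234 and
   MASK == 0xffff, clearing the masked bits gives 0x0fff00000fff0000, a valid
   bitmask immediate (a 12-bit run of ones repeated in each 32-bit half), so
   the function returns true with that value in VAL2.  The caller below can
   then materialise VAL as a bitmask MOV followed by one MOVK of 0x1234.  */
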
5683
5684 /* Return true if VAL is a valid MOVZ immediate. */
5685 static inline bool
5686 aarch64_is_movz (unsigned HOST_WIDE_INT val)
5687 {
5688 return (val >> (ctz_hwi (val) & 48)) < 65536;
5689 }
5690
5691
5692 /* Return true if immediate VAL can be created by a 64-bit MOVZ, MOVN or ORR (bitmask) immediate. */
5693 bool
5694 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
5695 {
5696 return aarch64_is_movz (val) || aarch64_is_movz (~val)
5697 || aarch64_bitmask_imm (val);
5698 }
5699
5700
5701 /* Return true if VAL is an immediate that can be created by a single
5702 MOV instruction. */
5703 bool
5704 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5705 {
5706 gcc_assert (mode == SImode || mode == DImode);
5707
5708 if (val < 65536)
5709 return true;
5710
5711 unsigned HOST_WIDE_INT mask =
5712 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
5713
5714 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
5715 return true;
5716
5717 val = (val & mask) | ((val << 32) & ~mask);
5718 return aarch64_bitmask_imm (val);
5719 }
5720
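/* Hand-checked examples for DImode: 0x0000000012340000 is accepted via MOVZ
   (16 significant bits, 16-bit aligned), 0xffffffffffff1234 via MOVN (its
   inverse is 0xedcb) and 0x5555555555555555 via a bitmask move, whereas
   0x0000000012345678 is rejected and needs two instructions.  */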
5721
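/* Return the number of instructions required to load immediate IMM of mode
   MODE (SImode or DImode) into DEST, and emit them if GENERATE is true.
   DEST is only used when GENERATE is true and may be null otherwise.  */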
5722 static int
5723 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5724 machine_mode mode)
5725 {
5726 int i;
5727 unsigned HOST_WIDE_INT val, val2, mask;
5728 int one_match, zero_match;
5729 int num_insns;
5730
5731 gcc_assert (mode == SImode || mode == DImode);
5732
5733 val = INTVAL (imm);
5734
5735 if (aarch64_move_imm (val, mode))
5736 {
5737 if (generate)
5738 emit_insn (gen_rtx_SET (dest, imm));
5739 return 1;
5740 }
5741
5742 if ((val >> 32) == 0 || mode == SImode)
5743 {
5744 if (generate)
5745 {
5746 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5747 if (mode == SImode)
5748 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5749 GEN_INT ((val >> 16) & 0xffff)));
5750 else
5751 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5752 GEN_INT ((val >> 16) & 0xffff)));
5753 }
5754 return 2;
5755 }
5756
5757 /* Remaining cases are all for DImode. */
5758
5759 mask = 0xffff;
5760 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5761 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5762 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5763 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5764
5765 /* Try a bitmask immediate and a movk to generate the immediate
5766 in 2 instructions. */
5767
5768 if (zero_match < 2 && one_match < 2)
5769 {
5770 for (i = 0; i < 64; i += 16)
5771 {
5772 if (aarch64_check_bitmask (val, val2, mask << i))
5773 break;
5774
5775 val2 = val & ~(mask << i);
5776 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
5777 break;
5778 }
5779
5780 if (i != 64)
5781 {
5782 if (generate)
5783 {
5784 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5785 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5786 GEN_INT ((val >> i) & 0xffff)));
5787 }
5788 return 2;
5789 }
5790 }
5791
5792 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5793 if (zero_match + one_match == 0)
5794 {
5795 for (i = 0; i < 48; i += 16)
5796 for (int j = i + 16; j < 64; j += 16)
5797 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
5798 {
5799 if (generate)
5800 {
5801 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5802 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5803 GEN_INT ((val >> i) & 0xffff)));
5804 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
5805 GEN_INT ((val >> j) & 0xffff)));
5806 }
5807 return 3;
5808 }
5809 }
5810
5811 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5812 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5813 otherwise skip zero bits. */
5814
5815 num_insns = 1;
5816 mask = 0xffff;
5817 val2 = one_match > zero_match ? ~val : val;
5818 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5819
5820 if (generate)
5821 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5822 ? (val | ~(mask << i))
5823 : (val & (mask << i)))));
5824 for (i += 16; i < 64; i += 16)
5825 {
5826 if ((val2 & (mask << i)) == 0)
5827 continue;
5828 if (generate)
5829 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5830 GEN_INT ((val >> i) & 0xffff)));
5831 num_insns ++;
5832 }
5833
5834 return num_insns;
5835 }
5836
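/* Rough cost summary, derived by hand from the code above: constants
   accepted by aarch64_move_imm take 1 instruction, constants with at most
   32 significant bits take at most 2 (MOV plus MOVK), and the worst case
   for an arbitrary 64-bit constant is 4 (a MOV of one 16-bit chunk followed
   by three MOVKs).  */
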
5837 /* Return whether imm is a 128-bit immediate which is simple enough to
5838 expand inline. */
5839 bool
5840 aarch64_mov128_immediate (rtx imm)
5841 {
5842 if (CONST_INT_P (imm))
5843 return true;
5844
5845 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5846
5847 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5848 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5849
5850 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5851 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5852 }
5853
5854
5855 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5856 a left shift of 0 or 12 bits. */
5857 bool
5858 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
5859 {
5860 return val < 4096 || (val & 0xfff000) == val;
5861 }
5862
5863 /* Return the largest value no greater than VAL that can be encoded as a 12-bit
5864 unsigned immediate with a left shift of 0 or 12. */
5865 static HOST_WIDE_INT
5866 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
5867 {
5868 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5869 handle correctly. */
5870 gcc_assert (val < 0x1000000);
5871
5872 if (val < 4096)
5873 return val;
5874
5875 return val & 0xfff000;
5876 }
5877
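/* For example: 0x800 and 0x123000 are valid as-is, 0x123456 is clamped to
   0x123000, and values of 0x1000000 or above trip the assertion because
   they cannot be handled here.  */
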
5878
5879 /* Test whether:
5880
5881 X = (X & AND_VAL) | IOR_VAL;
5882
5883 can be implemented using:
5884
5885 MOVK X, #(IOR_VAL >> shift), LSL #shift
5886
5887 Return the shift if so, otherwise return -1. */
5888 int
5889 aarch64_movk_shift (const wide_int_ref &and_val,
5890 const wide_int_ref &ior_val)
5891 {
5892 unsigned int precision = and_val.get_precision ();
5893 unsigned HOST_WIDE_INT mask = 0xffff;
5894 for (unsigned int shift = 0; shift < precision; shift += 16)
5895 {
5896 if (and_val == ~mask && (ior_val & mask) == ior_val)
5897 return shift;
5898 mask <<= 16;
5899 }
5900 return -1;
5901 }
5902
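/* Worked example, added for illustration: AND_VAL == 0xffffffff0000ffff with
   IOR_VAL == 0x12340000 describes X = (X & ~0xffff0000) | 0x12340000, which
   matches "MOVK X, #0x1234, LSL #16", so the function returns 16.  */
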
5903 /* Create a mask of ones covering the range from the lowest set bit to the
5904 highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
5905
5906 unsigned HOST_WIDE_INT
5907 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5908 {
5909 int lowest_bit_set = ctz_hwi (val_in);
5910 int highest_bit_set = floor_log2 (val_in);
5911 gcc_assert (val_in != 0);
5912
5913 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5914 (HOST_WIDE_INT_1U << lowest_bit_set));
5915 }
5916
5917 /* Create a constant in which every bit outside the range from the lowest set
5918 bit to the highest set bit of VAL_IN is set to 1. */
5919
5920 unsigned HOST_WIDE_INT
5921 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5922 {
5923 return val_in | ~aarch64_and_split_imm1 (val_in);
5924 }
5925
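/* Worked example, added for illustration: for VAL_IN == 0xff0000ff00000000
   the two functions above give 0xffffffff00000000 and 0xff0000ffffffffff.
   Both are valid bitmask immediates and their AND equals VAL_IN, so an AND
   with VAL_IN can be split into two AND-immediate instructions.  */
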
5926 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5927
5928 bool
5929 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5930 {
5931 scalar_int_mode int_mode;
5932 if (!is_a <scalar_int_mode> (mode, &int_mode))
5933 return false;
5934
5935 if (aarch64_bitmask_imm (val_in, int_mode))
5936 return false;
5937
5938 if (aarch64_move_imm (val_in, int_mode))
5939 return false;
5940
5941 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5942
5943 return aarch64_bitmask_imm (imm2, int_mode);
5944 }
5945
5946 /* Return the number of temporary registers that aarch64_add_offset_1
5947 would need to add OFFSET to a register. */
5948
5949 static unsigned int
5950 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5951 {
5952 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5953 }
5954
5955 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5956 a non-polynomial OFFSET. MODE is the mode of the addition.
5957 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5958 be set and CFA adjustments added to the generated instructions.
5959
5960 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5961 temporary if register allocation is already complete. This temporary
5962 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5963 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5964 the immediate again.
5965
5966 Since this function may be used to adjust the stack pointer, we must
5967 ensure that it cannot cause transient stack deallocation (for example
5968 by first incrementing SP and then decrementing when adjusting by a
5969 large immediate). */
5970
5971 static void
5972 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5973 rtx src, HOST_WIDE_INT offset, rtx temp1,
5974 bool frame_related_p, bool emit_move_imm)
5975 {
5976 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5977 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5978
5979 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5980 rtx_insn *insn;
5981
5982 if (!moffset)
5983 {
5984 if (!rtx_equal_p (dest, src))
5985 {
5986 insn = emit_insn (gen_rtx_SET (dest, src));
5987 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5988 }
5989 return;
5990 }
5991
5992 /* Single instruction adjustment. */
5993 if (aarch64_uimm12_shift (moffset))
5994 {
5995 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5996 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5997 return;
5998 }
5999
6000 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
6001 and either:
6002
6003 a) the offset cannot be loaded by a 16-bit move or
6004 b) there is no spare register into which we can move it. */
6005 if (moffset < 0x1000000
6006 && ((!temp1 && !can_create_pseudo_p ())
6007 || !aarch64_move_imm (moffset, mode)))
6008 {
6009 HOST_WIDE_INT low_off = moffset & 0xfff;
6010
6011 low_off = offset < 0 ? -low_off : low_off;
6012 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
6013 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6014 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
6015 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6016 return;
6017 }
6018
6019 /* Emit a move immediate if required and an addition/subtraction. */
6020 if (emit_move_imm)
6021 {
6022 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
6023 temp1 = aarch64_force_temporary (mode, temp1,
6024 gen_int_mode (moffset, mode));
6025 }
6026 insn = emit_insn (offset < 0
6027 ? gen_sub3_insn (dest, src, temp1)
6028 : gen_add3_insn (dest, src, temp1));
6029 if (frame_related_p)
6030 {
6031 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6032 rtx adj = plus_constant (mode, src, offset);
6033 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
6034 }
6035 }
6036
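/* Worked example, added for illustration: adding 0x123456 with no spare
   temporary uses the two-addition path above ("add dest, src, #0x456" then
   "add dest, dest, #0x123000"); adding 0x800 or 0x123000 needs a single ADD;
   and an offset of 0x1000000 or more needs a temporary register (or a fresh
   pseudo before register allocation) to hold the immediate.  */
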
6037 /* Return the number of temporary registers that aarch64_add_offset
6038 would need to move OFFSET into a register or add OFFSET to a register;
6039 ADD_P is true if we want the latter rather than the former. */
6040
6041 static unsigned int
6042 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
6043 {
6044 /* This follows the same structure as aarch64_add_offset. */
6045 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
6046 return 0;
6047
6048 unsigned int count = 0;
6049 HOST_WIDE_INT factor = offset.coeffs[1];
6050 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6051 poly_int64 poly_offset (factor, factor);
6052 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6053 /* Need one register for the ADDVL/ADDPL result. */
6054 count += 1;
6055 else if (factor != 0)
6056 {
6057 factor = abs (factor);
6058 if (factor > 16 * (factor & -factor))
6059 /* Need one register for the CNT result and one for the multiplication
6060 factor. If necessary, the second temporary can be reused for the
6061 constant part of the offset. */
6062 return 2;
6063 /* Need one register for the CNT result (which might then
6064 be shifted). */
6065 count += 1;
6066 }
6067 return count + aarch64_add_offset_1_temporaries (constant);
6068 }
6069
6070 /* If X can be represented as a poly_int64, return the number
6071 of temporaries that are required to add it to a register.
6072 Return -1 otherwise. */
6073
6074 int
6075 aarch64_add_offset_temporaries (rtx x)
6076 {
6077 poly_int64 offset;
6078 if (!poly_int_rtx_p (x, &offset))
6079 return -1;
6080 return aarch64_offset_temporaries (true, offset);
6081 }
6082
6083 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
6084 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
6085 be set and CFA adjustments added to the generated instructions.
6086
6087 TEMP1, if nonnull, is a register of mode MODE that can be used as a
6088 temporary if register allocation is already complete. This temporary
6089 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
6090 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6091 false to avoid emitting the immediate again.
6092
6093 TEMP2, if nonnull, is a second temporary register that doesn't
6094 overlap either DEST or SRC.
6095
6096 Since this function may be used to adjust the stack pointer, we must
6097 ensure that it cannot cause transient stack deallocation (for example
6098 by first incrementing SP and then decrementing when adjusting by a
6099 large immediate). */
6100
6101 static void
6102 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6103 poly_int64 offset, rtx temp1, rtx temp2,
6104 bool frame_related_p, bool emit_move_imm = true)
6105 {
6106 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
6107 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
6108 gcc_assert (temp1 == NULL_RTX
6109 || !frame_related_p
6110 || !reg_overlap_mentioned_p (temp1, dest));
6111 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
6112
6113 /* Try using ADDVL or ADDPL to add the whole value. */
6114 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
6115 {
6116 rtx offset_rtx = gen_int_mode (offset, mode);
6117 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6118 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6119 return;
6120 }
6121
6122 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6123 SVE vector register, over and above the minimum size of 128 bits.
6124 This is equivalent to half the value returned by CNTD with a
6125 vector shape of ALL. */
6126 HOST_WIDE_INT factor = offset.coeffs[1];
6127 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6128
6129 /* Try using ADDVL or ADDPL to add the VG-based part. */
6130 poly_int64 poly_offset (factor, factor);
6131 if (src != const0_rtx
6132 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6133 {
6134 rtx offset_rtx = gen_int_mode (poly_offset, mode);
6135 if (frame_related_p)
6136 {
6137 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6138 RTX_FRAME_RELATED_P (insn) = true;
6139 src = dest;
6140 }
6141 else
6142 {
6143 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
6144 src = aarch64_force_temporary (mode, temp1, addr);
6145 temp1 = temp2;
6146 temp2 = NULL_RTX;
6147 }
6148 }
6149 /* Otherwise use a CNT-based sequence. */
6150 else if (factor != 0)
6151 {
6152 /* Use a subtraction if we have a negative factor. */
6153 rtx_code code = PLUS;
6154 if (factor < 0)
6155 {
6156 factor = -factor;
6157 code = MINUS;
6158 }
6159
6160 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6161 into the multiplication. */
6162 rtx val;
6163 int shift = 0;
6164 if (factor & 1)
6165 /* Use a right shift by 1. */
6166 shift = -1;
6167 else
6168 factor /= 2;
6169 HOST_WIDE_INT low_bit = factor & -factor;
6170 if (factor <= 16 * low_bit)
6171 {
6172 if (factor > 16 * 8)
6173 {
6174 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6175 the value with the minimum multiplier and shift it into
6176 position. */
6177 int extra_shift = exact_log2 (low_bit);
6178 shift += extra_shift;
6179 factor >>= extra_shift;
6180 }
6181 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
6182 }
6183 else
6184 {
6185 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6186 directly, since that should increase the chances of being
6187 able to use a shift and add sequence. If LOW_BIT itself
6188 is out of range, just use CNTD. */
6189 if (low_bit <= 16 * 8)
6190 factor /= low_bit;
6191 else
6192 low_bit = 1;
6193
6194 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
6195 val = aarch64_force_temporary (mode, temp1, val);
6196
6197 if (can_create_pseudo_p ())
6198 {
6199 rtx coeff1 = gen_int_mode (factor, mode);
6200 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
6201 }
6202 else
6203 {
6204 /* Go back to using a negative multiplication factor if we have
6205 no register from which to subtract. */
6206 if (code == MINUS && src == const0_rtx)
6207 {
6208 factor = -factor;
6209 code = PLUS;
6210 }
6211 rtx coeff1 = gen_int_mode (factor, mode);
6212 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
6213 val = gen_rtx_MULT (mode, val, coeff1);
6214 }
6215 }
6216
6217 if (shift > 0)
6218 {
6219 /* Multiply by 1 << SHIFT. */
6220 val = aarch64_force_temporary (mode, temp1, val);
6221 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
6222 }
6223 else if (shift == -1)
6224 {
6225 /* Divide by 2. */
6226 val = aarch64_force_temporary (mode, temp1, val);
6227 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
6228 }
6229
6230 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6231 if (src != const0_rtx)
6232 {
6233 val = aarch64_force_temporary (mode, temp1, val);
6234 val = gen_rtx_fmt_ee (code, mode, src, val);
6235 }
6236 else if (code == MINUS)
6237 {
6238 val = aarch64_force_temporary (mode, temp1, val);
6239 val = gen_rtx_NEG (mode, val);
6240 }
6241
6242 if (constant == 0 || frame_related_p)
6243 {
6244 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
6245 if (frame_related_p)
6246 {
6247 RTX_FRAME_RELATED_P (insn) = true;
6248 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6249 gen_rtx_SET (dest, plus_constant (Pmode, src,
6250 poly_offset)));
6251 }
6252 src = dest;
6253 if (constant == 0)
6254 return;
6255 }
6256 else
6257 {
6258 src = aarch64_force_temporary (mode, temp1, val);
6259 temp1 = temp2;
6260 temp2 = NULL_RTX;
6261 }
6262
6263 emit_move_imm = true;
6264 }
6265
6266 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6267 frame_related_p, emit_move_imm);
6268 }
6269
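/* Worked example, added for illustration: an offset of (80, 64) (four vector
   lengths plus 16 bytes) splits into FACTOR == 64 and CONSTANT == 16, so the
   VG-based part (64, 64) is added with "addvl dest, src, #4" and the
   remaining 16 bytes with an ordinary ADD via aarch64_add_offset_1.  */
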
6270 /* Like aarch64_add_offset, but the offset is given as an rtx rather
6271 than a poly_int64. */
6272
6273 void
6274 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6275 rtx offset_rtx, rtx temp1, rtx temp2)
6276 {
6277 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6278 temp1, temp2, false);
6279 }
6280
6281 /* Add DELTA to the stack pointer, marking the instructions frame-related.
6282 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
6283 if TEMP1 already contains abs (DELTA). */
6284
6285 static inline void
6286 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
6287 {
6288 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
6289 temp1, temp2, true, emit_move_imm);
6290 }
6291
6292 /* Subtract DELTA from the stack pointer, marking the instructions
6293 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
6294 if nonnull. */
6295
6296 static inline void
6297 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6298 bool emit_move_imm = true)
6299 {
6300 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
6301 temp1, temp2, frame_related_p, emit_move_imm);
6302 }
6303
6304 /* Set DEST to (vec_series BASE STEP). */
6305
6306 static void
6307 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6308 {
6309 machine_mode mode = GET_MODE (dest);
6310 scalar_mode inner = GET_MODE_INNER (mode);
6311
6312 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6313 if (!aarch64_sve_index_immediate_p (base))
6314 base = force_reg (inner, base);
6315 if (!aarch64_sve_index_immediate_p (step))
6316 step = force_reg (inner, step);
6317
6318 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6319 }
6320
6321 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6322 register of mode MODE. Use TARGET for the result if it's nonnull
6323 and convenient.
6324
6325 The two vector modes must have the same element mode. The behavior
6326 is to duplicate architectural lane N of SRC into architectural lanes
6327 N + I * STEP of the result. On big-endian targets, architectural
6328 lane 0 of an Advanced SIMD vector is the last element of the vector
6329 in memory layout, so for big-endian targets this operation has the
6330 effect of reversing SRC before duplicating it. Callers need to
6331 account for this. */
6332
6333 rtx
6334 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6335 {
6336 machine_mode src_mode = GET_MODE (src);
6337 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6338 insn_code icode = (BYTES_BIG_ENDIAN
6339 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6340 : code_for_aarch64_vec_duplicate_vq_le (mode));
6341
6342 unsigned int i = 0;
6343 expand_operand ops[3];
6344 create_output_operand (&ops[i++], target, mode);
6345 create_output_operand (&ops[i++], src, src_mode);
6346 if (BYTES_BIG_ENDIAN)
6347 {
6348 /* Create a PARALLEL describing the reversal of SRC. */
6349 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6350 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6351 nelts_per_vq - 1, -1);
6352 create_fixed_operand (&ops[i++], sel);
6353 }
6354 expand_insn (icode, i, ops);
6355 return ops[0].value;
6356 }
6357
6358 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6359 the memory image into DEST. Return true on success. */
6360
6361 static bool
6362 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6363 {
6364 src = force_const_mem (GET_MODE (src), src);
6365 if (!src)
6366 return false;
6367
6368 /* Make sure that the address is legitimate. */
6369 if (!aarch64_sve_ld1rq_operand_p (src))
6370 {
6371 rtx addr = force_reg (Pmode, XEXP (src, 0));
6372 src = replace_equiv_address (src, addr);
6373 }
6374
6375 machine_mode mode = GET_MODE (dest);
6376 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6377 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6378 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6379 return true;
6380 }
6381
6382 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6383 by N "background" values. Try to move it into TARGET using:
6384
6385 PTRUE PRED.<T>, VL<N>
6386 MOV TRUE.<T>, #<foreground>
6387 MOV FALSE.<T>, #<background>
6388 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6389
6390 The PTRUE is always a single instruction but the MOVs might need a
6391 longer sequence. If the background value is zero (as it often is),
6392 the sequence can sometimes collapse to a PTRUE followed by a
6393 zero-predicated move.
6394
6395 Return the target on success, otherwise return null. */
6396
6397 static rtx
6398 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6399 {
6400 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6401
6402 /* Make sure that the PTRUE is valid. */
6403 machine_mode mode = GET_MODE (src);
6404 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6405 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6406 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6407 == AARCH64_NUM_SVPATTERNS)
6408 return NULL_RTX;
6409
6410 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6411 rtx_vector_builder true_builder (mode, npatterns, 1);
6412 rtx_vector_builder false_builder (mode, npatterns, 1);
6413 for (unsigned int i = 0; i < npatterns; ++i)
6414 {
6415 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6416 pred_builder.quick_push (CONST1_RTX (BImode));
6417 }
6418 for (unsigned int i = 0; i < npatterns; ++i)
6419 {
6420 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6421 pred_builder.quick_push (CONST0_RTX (BImode));
6422 }
6423 expand_operand ops[4];
6424 create_output_operand (&ops[0], target, mode);
6425 create_input_operand (&ops[1], true_builder.build (), mode);
6426 create_input_operand (&ops[2], false_builder.build (), mode);
6427 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6428 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6429 return target;
6430 }
6431
6432 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6433 SVE data mode and isn't a legitimate constant. Use TARGET for the
6434 result if convenient.
6435
6436 The returned register can have whatever mode seems most natural
6437 given the contents of SRC. */
6438
6439 static rtx
6440 aarch64_expand_sve_const_vector (rtx target, rtx src)
6441 {
6442 machine_mode mode = GET_MODE (src);
6443 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6444 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6445 scalar_mode elt_mode = GET_MODE_INNER (mode);
6446 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6447 unsigned int container_bits = aarch64_sve_container_bits (mode);
6448 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6449
6450 if (nelts_per_pattern == 1
6451 && encoded_bits <= 128
6452 && container_bits != elt_bits)
6453 {
6454 /* We have a partial vector mode and a constant whose full-vector
6455 equivalent would occupy a repeating 128-bit sequence. Build that
6456 full-vector equivalent instead, so that we have the option of
6457 using LD1RQ and Advanced SIMD operations. */
6458 unsigned int repeat = container_bits / elt_bits;
6459 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6460 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6461 for (unsigned int i = 0; i < npatterns; ++i)
6462 for (unsigned int j = 0; j < repeat; ++j)
6463 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6464 target = aarch64_target_reg (target, full_mode);
6465 return aarch64_expand_sve_const_vector (target, builder.build ());
6466 }
6467
6468 if (nelts_per_pattern == 1 && encoded_bits == 128)
6469 {
6470 /* The constant is a duplicated quadword but can't be narrowed
6471 beyond a quadword. Get the memory image of the first quadword
6472 as a 128-bit vector and try using LD1RQ to load it from memory.
6473
6474 The effect for both endiannesses is to load memory lane N into
6475 architectural lanes N + I * STEP of the result. On big-endian
6476 targets, the layout of the 128-bit vector in an Advanced SIMD
6477 register would be different from its layout in an SVE register,
6478 but this 128-bit vector is a memory value only. */
6479 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6480 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6481 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6482 return target;
6483 }
6484
6485 if (nelts_per_pattern == 1 && encoded_bits < 128)
6486 {
6487 /* The vector is a repeating sequence of 64 bits or fewer.
6488 See if we can load them using an Advanced SIMD move and then
6489 duplicate it to fill a vector. This is better than using a GPR
6490 move because it keeps everything in the same register file. */
6491 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6492 rtx_vector_builder builder (vq_mode, npatterns, 1);
6493 for (unsigned int i = 0; i < npatterns; ++i)
6494 {
6495 /* We want memory lane N to go into architectural lane N,
6496 so reverse for big-endian targets. The DUP .Q pattern
6497 has a compensating reverse built-in. */
6498 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6499 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6500 }
6501 rtx vq_src = builder.build ();
6502 if (aarch64_simd_valid_immediate (vq_src, NULL))
6503 {
6504 vq_src = force_reg (vq_mode, vq_src);
6505 return aarch64_expand_sve_dupq (target, mode, vq_src);
6506 }
6507
6508 /* Get an integer representation of the repeating part of Advanced
6509 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6510 which for big-endian targets is lane-swapped wrt a normal
6511 Advanced SIMD vector. This means that for both endiannesses,
6512 memory lane N of SVE vector SRC corresponds to architectural
6513 lane N of a register holding VQ_SRC. This in turn means that
6514 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6515 as a single 128-bit value) and thus that memory lane 0 of SRC is
6516 in the lsb of the integer. Duplicating the integer therefore
6517 ensures that memory lane N of SRC goes into architectural lane
6518 N + I * INDEX of the SVE register. */
6519 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6520 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6521 if (elt_value)
6522 {
6523 /* Pretend that we had a vector of INT_MODE to start with. */
6524 elt_mode = int_mode;
6525 mode = aarch64_full_sve_mode (int_mode).require ();
6526
6527 /* If the integer can be moved into a general register by a
6528 single instruction, do that and duplicate the result. */
6529 if (CONST_INT_P (elt_value)
6530 && aarch64_move_imm (INTVAL (elt_value),
6531 encoded_bits <= 32 ? SImode : DImode))
6532 {
6533 elt_value = force_reg (elt_mode, elt_value);
6534 return expand_vector_broadcast (mode, elt_value);
6535 }
6536 }
6537 else if (npatterns == 1)
6538 /* We're duplicating a single value, but can't do better than
6539 force it to memory and load from there. This handles things
6540 like symbolic constants. */
6541 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6542
6543 if (elt_value)
6544 {
6545 /* Load the element from memory if we can, otherwise move it into
6546 a register and use a DUP. */
6547 rtx op = force_const_mem (elt_mode, elt_value);
6548 if (!op)
6549 op = force_reg (elt_mode, elt_value);
6550 return expand_vector_broadcast (mode, op);
6551 }
6552 }
6553
6554 /* Try using INDEX. */
6555 rtx base, step;
6556 if (const_vec_series_p (src, &base, &step))
6557 {
6558 aarch64_expand_vec_series (target, base, step);
6559 return target;
6560 }
6561
6562 /* From here on, it's better to force the whole constant to memory
6563 if we can. */
6564 if (GET_MODE_NUNITS (mode).is_constant ())
6565 return NULL_RTX;
6566
6567 if (nelts_per_pattern == 2)
6568 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6569 return res;
6570
6571 /* Expand each pattern individually. */
6572 gcc_assert (npatterns > 1);
6573 rtx_vector_builder builder;
6574 auto_vec<rtx, 16> vectors (npatterns);
6575 for (unsigned int i = 0; i < npatterns; ++i)
6576 {
6577 builder.new_vector (mode, 1, nelts_per_pattern);
6578 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6579 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6580 vectors.quick_push (force_reg (mode, builder.build ()));
6581 }
6582
6583 /* Use permutes to interleave the separate vectors. */
6584 while (npatterns > 1)
6585 {
6586 npatterns /= 2;
6587 for (unsigned int i = 0; i < npatterns; ++i)
6588 {
6589 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6590 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6591 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6592 vectors[i] = tmp;
6593 }
6594 }
6595 gcc_assert (vectors[0] == target);
6596 return target;
6597 }
6598
6599 /* Use WHILE to set a predicate register of mode MODE in which the first
6600 VL bits are set and the rest are clear. Use TARGET for the register
6601 if it's nonnull and convenient. */
6602
6603 static rtx
6604 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6605 unsigned int vl)
6606 {
6607 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6608 target = aarch64_target_reg (target, mode);
6609 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6610 target, const0_rtx, limit));
6611 return target;
6612 }
6613
6614 static rtx
6615 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6616
6617 /* BUILDER is a constant predicate in which the index of every set bit
6618 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6619 by inverting every element at a multiple of ELT_SIZE and EORing the
6620 result with an ELT_SIZE PTRUE.
6621
6622 Return a register that contains the constant on success, otherwise
6623 return null. Use TARGET as the register if it is nonnull and
6624 convenient. */
6625
6626 static rtx
6627 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6628 unsigned int elt_size)
6629 {
6630 /* Invert every element at a multiple of ELT_SIZE, keeping the
6631 other bits zero. */
6632 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6633 builder.nelts_per_pattern ());
6634 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6635 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6636 inv_builder.quick_push (const1_rtx);
6637 else
6638 inv_builder.quick_push (const0_rtx);
6639 inv_builder.finalize ();
6640
6641 /* See if we can load the constant cheaply. */
6642 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6643 if (!inv)
6644 return NULL_RTX;
6645
6646 /* EOR the result with an ELT_SIZE PTRUE. */
6647 rtx mask = aarch64_ptrue_all (elt_size);
6648 mask = force_reg (VNx16BImode, mask);
6649 inv = gen_lowpart (VNx16BImode, inv);
6650 target = aarch64_target_reg (target, VNx16BImode);
6651 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6652 return target;
6653 }
6654
6655 /* BUILDER is a constant predicate in which the index of every set bit
6656 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6657 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6658 register on success, otherwise return null. Use TARGET as the register
6659 if nonnull and convenient. */
6660
6661 static rtx
6662 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6663 unsigned int elt_size,
6664 unsigned int permute_size)
6665 {
6666 /* We're going to split the constant into two new constants A and B,
6667 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6668 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6669
6670 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6671 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6672
6673 where _ indicates elements that will be discarded by the permute.
6674
6675 First calculate the ELT_SIZEs for A and B. */
6676 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6677 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6678 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6679 if (INTVAL (builder.elt (i)) != 0)
6680 {
6681 if (i & permute_size)
6682 b_elt_size |= i - permute_size;
6683 else
6684 a_elt_size |= i;
6685 }
6686 a_elt_size &= -a_elt_size;
6687 b_elt_size &= -b_elt_size;
6688
6689 /* Now construct the vectors themselves. */
6690 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6691 builder.nelts_per_pattern ());
6692 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6693 builder.nelts_per_pattern ());
6694 unsigned int nelts = builder.encoded_nelts ();
6695 for (unsigned int i = 0; i < nelts; ++i)
6696 if (i & (elt_size - 1))
6697 {
6698 a_builder.quick_push (const0_rtx);
6699 b_builder.quick_push (const0_rtx);
6700 }
6701 else if ((i & permute_size) == 0)
6702 {
6703 /* The A and B elements are significant. */
6704 a_builder.quick_push (builder.elt (i));
6705 b_builder.quick_push (builder.elt (i + permute_size));
6706 }
6707 else
6708 {
6709 /* The A and B elements are going to be discarded, so pick whatever
6710 is likely to give a nice constant. We are targeting element
6711 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6712 with the aim of each being a sequence of ones followed by
6713 a sequence of zeros. So:
6714
6715 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6716 duplicate the last X_ELT_SIZE element, to extend the
6717 current sequence of ones or zeros.
6718
6719 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6720 zero, so that the constant really does have X_ELT_SIZE and
6721 not a smaller size. */
6722 if (a_elt_size > permute_size)
6723 a_builder.quick_push (const0_rtx);
6724 else
6725 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6726 if (b_elt_size > permute_size)
6727 b_builder.quick_push (const0_rtx);
6728 else
6729 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6730 }
6731 a_builder.finalize ();
6732 b_builder.finalize ();
6733
6734 /* Try loading A into a register. */
6735 rtx_insn *last = get_last_insn ();
6736 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6737 if (!a)
6738 return NULL_RTX;
6739
6740 /* Try loading B into a register. */
6741 rtx b = a;
6742 if (a_builder != b_builder)
6743 {
6744 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6745 if (!b)
6746 {
6747 delete_insns_since (last);
6748 return NULL_RTX;
6749 }
6750 }
6751
6752 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6753 operands but permutes them as though they had mode MODE. */
6754 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6755 target = aarch64_target_reg (target, GET_MODE (a));
6756 rtx type_reg = CONST0_RTX (mode);
6757 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6758 return target;
6759 }
6760
6761 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6762 constant in BUILDER into an SVE predicate register. Return the register
6763 on success, otherwise return null. Use TARGET for the register if
6764 nonnull and convenient.
6765
6766 ALLOW_RECURSE_P is true if we can use methods that would call this
6767 function recursively. */
6768
6769 static rtx
6770 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6771 bool allow_recurse_p)
6772 {
6773 if (builder.encoded_nelts () == 1)
6774 /* A PFALSE or a PTRUE .B ALL. */
6775 return aarch64_emit_set_immediate (target, builder);
6776
6777 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6778 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6779 {
6780 /* If we can load the constant using PTRUE, use it as-is. */
6781 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6782 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6783 return aarch64_emit_set_immediate (target, builder);
6784
6785 /* Otherwise use WHILE to set the first VL bits. */
6786 return aarch64_sve_move_pred_via_while (target, mode, vl);
6787 }
6788
6789 if (!allow_recurse_p)
6790 return NULL_RTX;
6791
6792 /* Try inverting the vector in element size ELT_SIZE and then EORing
6793 the result with an ELT_SIZE PTRUE. */
6794 if (INTVAL (builder.elt (0)) == 0)
6795 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6796 elt_size))
6797 return res;
6798
6799 /* Try using TRN1 to permute two simpler constants. */
6800 for (unsigned int i = elt_size; i <= 8; i *= 2)
6801 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6802 elt_size, i))
6803 return res;
6804
6805 return NULL_RTX;
6806 }
6807
6808 /* Return an SVE predicate register that contains the VNx16BImode
6809 constant in BUILDER, without going through the move expanders.
6810
6811 The returned register can have whatever mode seems most natural
6812 given the contents of BUILDER. Use TARGET for the result if
6813 convenient. */
6814
6815 static rtx
6816 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6817 {
6818 /* Try loading the constant using pure predicate operations. */
6819 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6820 return res;
6821
6822 /* Try forcing the constant to memory. */
6823 if (builder.full_nelts ().is_constant ())
6824 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6825 {
6826 target = aarch64_target_reg (target, VNx16BImode);
6827 emit_move_insn (target, mem);
6828 return target;
6829 }
6830
6831 /* The last resort is to load the constant as an integer and then
6832 compare it against zero. Use -1 for set bits in order to increase
6833 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6834 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6835 builder.nelts_per_pattern ());
6836 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6837 int_builder.quick_push (INTVAL (builder.elt (i))
6838 ? constm1_rtx : const0_rtx);
6839 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6840 int_builder.build ());
6841 }
6842
6843 /* Set DEST to immediate IMM. */
6844
6845 void
6846 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6847 {
6848 machine_mode mode = GET_MODE (dest);
6849
6850 /* Check what type of symbol it is. */
6851 scalar_int_mode int_mode;
6852 if ((SYMBOL_REF_P (imm)
6853 || LABEL_REF_P (imm)
6854 || GET_CODE (imm) == CONST
6855 || GET_CODE (imm) == CONST_POLY_INT)
6856 && is_a <scalar_int_mode> (mode, &int_mode))
6857 {
6858 rtx mem;
6859 poly_int64 offset;
6860 HOST_WIDE_INT const_offset;
6861 enum aarch64_symbol_type sty;
6862
6863 /* If we have (const (plus symbol offset)), separate out the offset
6864 before we start classifying the symbol. */
6865 rtx base = strip_offset (imm, &offset);
6866
6867 /* We must always add an offset involving VL separately, rather than
6868 folding it into the relocation. */
6869 if (!offset.is_constant (&const_offset))
6870 {
6871 if (!TARGET_SVE)
6872 {
6873 aarch64_report_sve_required ();
6874 return;
6875 }
6876 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6877 emit_insn (gen_rtx_SET (dest, imm));
6878 else
6879 {
6880 /* Do arithmetic on 32-bit values if the result is smaller
6881 than that. */
6882 if (partial_subreg_p (int_mode, SImode))
6883 {
6884 /* It is invalid to do symbol calculations in modes
6885 narrower than SImode. */
6886 gcc_assert (base == const0_rtx);
6887 dest = gen_lowpart (SImode, dest);
6888 int_mode = SImode;
6889 }
6890 if (base != const0_rtx)
6891 {
6892 base = aarch64_force_temporary (int_mode, dest, base);
6893 aarch64_add_offset (int_mode, dest, base, offset,
6894 NULL_RTX, NULL_RTX, false);
6895 }
6896 else
6897 aarch64_add_offset (int_mode, dest, base, offset,
6898 dest, NULL_RTX, false);
6899 }
6900 return;
6901 }
6902
6903 sty = aarch64_classify_symbol (base, const_offset);
6904 switch (sty)
6905 {
6906 case SYMBOL_FORCE_TO_MEM:
6907 if (int_mode != ptr_mode)
6908 imm = convert_memory_address (ptr_mode, imm);
6909
6910 if (const_offset != 0
6911 && targetm.cannot_force_const_mem (ptr_mode, imm))
6912 {
6913 gcc_assert (can_create_pseudo_p ());
6914 base = aarch64_force_temporary (int_mode, dest, base);
6915 aarch64_add_offset (int_mode, dest, base, const_offset,
6916 NULL_RTX, NULL_RTX, false);
6917 return;
6918 }
6919
6920 mem = force_const_mem (ptr_mode, imm);
6921 gcc_assert (mem);
6922
6923 /* If we aren't generating PC relative literals, then
6924 we need to expand the literal pool access carefully.
6925 This is something that needs to be done in a number
6926 of places, so could well live as a separate function. */
6927 if (!aarch64_pcrelative_literal_loads)
6928 {
6929 gcc_assert (can_create_pseudo_p ());
6930 base = gen_reg_rtx (ptr_mode);
6931 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6932 if (ptr_mode != Pmode)
6933 base = convert_memory_address (Pmode, base);
6934 mem = gen_rtx_MEM (ptr_mode, base);
6935 }
6936
6937 if (int_mode != ptr_mode)
6938 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6939
6940 emit_insn (gen_rtx_SET (dest, mem));
6941
6942 return;
6943
6944 case SYMBOL_SMALL_TLSGD:
6945 case SYMBOL_SMALL_TLSDESC:
6946 case SYMBOL_SMALL_TLSIE:
6947 case SYMBOL_SMALL_GOT_28K:
6948 case SYMBOL_SMALL_GOT_4G:
6949 case SYMBOL_TINY_GOT:
6950 case SYMBOL_TINY_TLSIE:
6951 if (const_offset != 0)
6952 {
6953 gcc_assert (can_create_pseudo_p ());
6954 base = aarch64_force_temporary (int_mode, dest, base);
6955 aarch64_add_offset (int_mode, dest, base, const_offset,
6956 NULL_RTX, NULL_RTX, false);
6957 return;
6958 }
6959 /* FALLTHRU */
6960
6961 case SYMBOL_SMALL_ABSOLUTE:
6962 case SYMBOL_TINY_ABSOLUTE:
6963 case SYMBOL_TLSLE12:
6964 case SYMBOL_TLSLE24:
6965 case SYMBOL_TLSLE32:
6966 case SYMBOL_TLSLE48:
6967 aarch64_load_symref_appropriately (dest, imm, sty);
6968 return;
6969
6970 default:
6971 gcc_unreachable ();
6972 }
6973 }
6974
6975 if (!CONST_INT_P (imm))
6976 {
6977 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6978 {
6979 /* Only the low bit of each .H, .S and .D element is defined,
6980 so we can set the upper bits to whatever we like. If the
6981 predicate is all-true in MODE, prefer to set all the undefined
6982 bits as well, so that we can share a single .B predicate for
6983 all modes. */
6984 if (imm == CONSTM1_RTX (mode))
6985 imm = CONSTM1_RTX (VNx16BImode);
6986
6987 /* All methods for constructing predicate modes wider than VNx16BI
6988 will set the upper bits of each element to zero. Expose this
6989 by moving such constants as a VNx16BI, so that all bits are
6990 significant and so that constants for different modes can be
6991 shared. The wider constant will still be available as a
6992 REG_EQUAL note. */
6993 rtx_vector_builder builder;
6994 if (aarch64_get_sve_pred_bits (builder, imm))
6995 {
6996 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6997 if (dest != res)
6998 emit_move_insn (dest, gen_lowpart (mode, res));
6999 return;
7000 }
7001 }
7002
7003 if (GET_CODE (imm) == HIGH
7004 || aarch64_simd_valid_immediate (imm, NULL))
7005 {
7006 emit_insn (gen_rtx_SET (dest, imm));
7007 return;
7008 }
7009
7010 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
7011 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
7012 {
7013 if (dest != res)
7014 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
7015 return;
7016 }
7017
7018 rtx mem = force_const_mem (mode, imm);
7019 gcc_assert (mem);
7020 emit_move_insn (dest, mem);
7021 return;
7022 }
7023
7024 aarch64_internal_mov_immediate (dest, imm, true, mode);
7025 }
7026
7027 /* Return the MEM rtx that provides the canary value that should be used
7028 for stack-smashing protection. MODE is the mode of the memory.
7029 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
7030 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
7031 indicates whether the caller is performing a SET or a TEST operation. */
7032
7033 rtx
7034 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
7035 aarch64_salt_type salt_type)
7036 {
7037 rtx addr;
7038 if (aarch64_stack_protector_guard == SSP_GLOBAL)
7039 {
7040 gcc_assert (MEM_P (decl_rtl));
7041 addr = XEXP (decl_rtl, 0);
7042 poly_int64 offset;
7043 rtx base = strip_offset_and_salt (addr, &offset);
7044 if (!SYMBOL_REF_P (base))
7045 return decl_rtl;
7046
7047 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
7048 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
7049 addr = gen_rtx_CONST (Pmode, addr);
7050 addr = plus_constant (Pmode, addr, offset);
7051 }
7052 else
7053 {
7054 /* Calculate the address from the system register. */
7055 rtx salt = GEN_INT (salt_type);
7056 addr = gen_reg_rtx (mode);
7057 if (mode == DImode)
7058 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
7059 else
7060 {
7061 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
7062 addr = convert_memory_address (Pmode, addr);
7063 }
7064 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
7065 }
7066 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
7067 }
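
/* As a rough sketch (option values are illustrative only): with
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0
   -mstack-protector-guard-offset=8, the system-register path above
   typically ends up loading the canary address with something like:

	mrs	x0, sp_el0
	ldr	x0, [x0, 8]

   while the SSP_GLOBAL path wraps __stack_chk_guard in a salted
   UNSPEC_SALT_ADDR so that the addresses used for the SET and the TEST
   of the canary stay distinct.  */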
7068
7069 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
7070 that is known to contain PTRUE. */
7071
7072 void
7073 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
7074 {
7075 expand_operand ops[3];
7076 machine_mode mode = GET_MODE (dest);
7077 create_output_operand (&ops[0], dest, mode);
7078 create_input_operand (&ops[1], pred, GET_MODE (pred));
7079 create_input_operand (&ops[2], src, mode);
7080 temporary_volatile_ok v (true);
7081 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
7082 }
7083
7084 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
7085 operand is in memory. In this case we need to use the predicated LD1
7086 and ST1 instead of LDR and STR, both for correctness on big-endian
7087 targets and because LD1 and ST1 support a wider range of addressing modes.
7088 PRED_MODE is the mode of the predicate.
7089
7090 See the comment at the head of aarch64-sve.md for details about the
7091 big-endian handling. */
7092
7093 void
7094 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
7095 {
7096 machine_mode mode = GET_MODE (dest);
7097 rtx ptrue = aarch64_ptrue_reg (pred_mode);
7098 if (!register_operand (src, mode)
7099 && !register_operand (dest, mode))
7100 {
7101 rtx tmp = gen_reg_rtx (mode);
7102 if (MEM_P (src))
7103 aarch64_emit_sve_pred_move (tmp, ptrue, src);
7104 else
7105 emit_move_insn (tmp, src);
7106 src = tmp;
7107 }
7108 aarch64_emit_sve_pred_move (dest, ptrue, src);
7109 }
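
/* For a (hypothetical) memory-to-memory copy of a VNx16QI value this
   expands to a predicated load and store through a fresh register,
   roughly:

	ptrue	p0.b, all
	ld1b	z0.b, p0/z, [x0]
	st1b	z0.b, p0, [x1]

   rather than an LDR/STR pair, which would not in general preserve the
   expected element layout on big-endian and supports fewer addressing
   modes.  */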
7110
7111 /* Called only on big-endian targets. See whether an SVE vector move
7112 from SRC to DEST is effectively a REV[BHW] instruction, because at
7113 least one operand is a subreg of an SVE vector that has wider or
7114 narrower elements. Return true and emit the instruction if so.
7115
7116 For example:
7117
7118 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
7119
7120 represents a VIEW_CONVERT between the following vectors, viewed
7121 in memory order:
7122
7123 R2: { [0].high, [0].low, [1].high, [1].low, ... }
7124 R1: { [0], [1], [2], [3], ... }
7125
7126 The high part of lane X in R2 should therefore correspond to lane X*2
7127 of R1, but the register representations are:
7128
7129 msb lsb
7130 R2: ...... [1].high [1].low [0].high [0].low
7131 R1: ...... [3] [2] [1] [0]
7132
7133 where the low part of lane X in R2 corresponds to lane X*2 in R1.
7134 We therefore need a reverse operation to swap the high and low values
7135 around.
7136
7137 This is purely an optimization. Without it we would spill the
7138 subreg operand to the stack in one mode and reload it in the
7139 other mode, which has the same effect as the REV. */
7140
7141 bool
7142 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
7143 {
7144 gcc_assert (BYTES_BIG_ENDIAN);
7145
7146 /* Do not try to optimize subregs that LRA has created for matched
7147 reloads. These subregs only exist as a temporary measure to make
7148 the RTL well-formed, but they are exempt from the usual
7149 TARGET_CAN_CHANGE_MODE_CLASS rules.
7150
7151 For example, if we have:
7152
7153 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7154
7155 and the constraints require R1 and R2 to be in the same register,
7156 LRA may need to create RTL such as:
7157
7158 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7159 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7160 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7161
7162 which forces both the input and output of the original instruction
7163 to use the same hard register. But for this to work, the normal
7164 rules have to be suppressed on the subreg input, otherwise LRA
7165 would need to reload that input too, meaning that the process
7166 would never terminate. To compensate for this, the normal rules
7167 are also suppressed for the subreg output of the first move.
7168 Ignoring the special case and handling the first move normally
7169 would therefore generate wrong code: we would reverse the elements
7170 for the first subreg but not reverse them back for the second subreg. */
7171 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
7172 dest = SUBREG_REG (dest);
7173 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
7174 src = SUBREG_REG (src);
7175
7176 /* The optimization handles two single SVE REGs with different element
7177 sizes. */
7178 if (!REG_P (dest)
7179 || !REG_P (src)
7180 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
7181 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
7182 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
7183 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
7184 return false;
7185
7186 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
7187 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
7188 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
7189 UNSPEC_REV_SUBREG);
7190 emit_insn (gen_rtx_SET (dest, unspec));
7191 return true;
7192 }
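
/* Continuing the example in the comment above: moving a VNx16QI
   register into a VNx8HI register on big-endian is rewritten as the
   UNSPEC_REV_SUBREG pattern, which aarch64_split_sve_subreg_move below
   eventually splits into (roughly, with illustrative register numbers)
   a predicated byte reversal within each halfword container:

	ptrue	p0.b, all
	revb	z0.h, p0/m, z1.h  */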
7193
7194 /* Return a copy of X with mode MODE, without changing its other
7195 attributes. Unlike gen_lowpart, this doesn't care whether the
7196 mode change is valid. */
7197
7198 rtx
7199 aarch64_replace_reg_mode (rtx x, machine_mode mode)
7200 {
7201 if (GET_MODE (x) == mode)
7202 return x;
7203
7204 x = shallow_copy_rtx (x);
7205 set_mode_and_regno (x, mode, REGNO (x));
7206 return x;
7207 }
7208
7209 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
7210 stored in wider integer containers. */
7211
7212 static unsigned int
7213 aarch64_sve_rev_unspec (machine_mode mode)
7214 {
7215 switch (GET_MODE_UNIT_SIZE (mode))
7216 {
7217 case 1: return UNSPEC_REVB;
7218 case 2: return UNSPEC_REVH;
7219 case 4: return UNSPEC_REVW;
7220 }
7221 gcc_unreachable ();
7222 }
7223
7224 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
7225 operands. */
7226
7227 void
7228 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
7229 {
7230 /* Decide which REV operation we need. The mode with wider elements
7231 determines the mode of the operands and the mode with the narrower
7232 elements determines the reverse width. */
7233 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
7234 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
7235 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
7236 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
7237 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
7238
7239 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
7240 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
7241
7242 /* Get the operands in the appropriate modes and emit the instruction. */
7243 ptrue = gen_lowpart (pred_mode, ptrue);
7244 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
7245 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
7246 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
7247 dest, ptrue, src));
7248 }
7249
7250 static bool
7251 aarch64_function_ok_for_sibcall (tree, tree exp)
7252 {
7253 if (crtl->abi->id () != expr_callee_abi (exp).id ())
7254 return false;
7255
7256 return true;
7257 }
7258
7259 /* Subroutine of aarch64_pass_by_reference for arguments that are not
7260 passed in SVE registers. */
7261
7262 static bool
7263 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7264 const function_arg_info &arg)
7265 {
7266 HOST_WIDE_INT size;
7267 machine_mode dummymode;
7268 int nregs;
7269
7270 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
7271 if (arg.mode == BLKmode && arg.type)
7272 size = int_size_in_bytes (arg.type);
7273 else
7274 /* No frontends can create types with variable-sized modes, so we
7275 shouldn't be asked to pass or return them. */
7276 size = GET_MODE_SIZE (arg.mode).to_constant ();
7277
7278 /* Aggregates are passed by reference based on their size. */
7279 if (arg.aggregate_type_p ())
7280 size = int_size_in_bytes (arg.type);
7281
7282 /* Variable-sized arguments are always passed by reference. */
7283 if (size < 0)
7284 return true;
7285
7286 /* Can this be a candidate to be passed in fp/simd register(s)? */
7287 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
7288 &dummymode, &nregs, NULL,
7289 !pcum || pcum->silent_p))
7290 return false;
7291
7292 /* Arguments which are variable sized or larger than 2 registers are
7293 passed by reference unless they are a homogeneous floating-point
7294 aggregate. */
7295 return size > 2 * UNITS_PER_WORD;
7296 }
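
/* As a concrete illustration of the size rule above: a 24-byte
   struct { long a, b, c; } is larger than two registers and so is
   passed by reference, a 16-byte struct { long a, b; } is passed in a
   pair of general registers, and a 32-byte homogeneous floating-point
   aggregate such as struct { double a, b, c, d; } is still an fp/simd
   candidate and is passed by value in four FP/SIMD registers.  */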
7297
7298 /* Implement TARGET_PASS_BY_REFERENCE. */
7299
7300 static bool
7301 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7302 const function_arg_info &arg)
7303 {
7304 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7305
7306 if (!arg.type)
7307 return aarch64_pass_by_reference_1 (pcum, arg);
7308
7309 pure_scalable_type_info pst_info;
7310 switch (pst_info.analyze (arg.type))
7311 {
7312 case pure_scalable_type_info::IS_PST:
7313 if (pcum && !pcum->silent_p && !TARGET_SVE)
7314 /* We can't gracefully recover at this point, so make this a
7315 fatal error. */
7316 fatal_error (input_location, "arguments of type %qT require"
7317 " the SVE ISA extension", arg.type);
7318
7319 /* Variadic SVE types are passed by reference. Normal non-variadic
7320 arguments are too if we've run out of registers. */
7321 return (!arg.named
7322 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7323 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7324
7325 case pure_scalable_type_info::DOESNT_MATTER:
7326 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7327 return true;
7328
7329 case pure_scalable_type_info::NO_ABI_IDENTITY:
7330 case pure_scalable_type_info::ISNT_PST:
7331 return aarch64_pass_by_reference_1 (pcum, arg);
7332 }
7333 gcc_unreachable ();
7334 }
7335
7336 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7337 static bool
7338 aarch64_return_in_msb (const_tree valtype)
7339 {
7340 machine_mode dummy_mode;
7341 int dummy_int;
7342
7343 /* Never happens in little-endian mode. */
7344 if (!BYTES_BIG_ENDIAN)
7345 return false;
7346
7347 /* Only composite types smaller than or equal to 16 bytes can
7348 be potentially returned in registers. */
7349 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7350 || int_size_in_bytes (valtype) <= 0
7351 || int_size_in_bytes (valtype) > 16)
7352 return false;
7353
7354 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7355 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7356 is always passed/returned in the least significant bits of fp/simd
7357 register(s). */
7358 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7359 &dummy_mode, &dummy_int, NULL,
7360 false))
7361 return false;
7362
7363 /* Likewise pure scalable types for SVE vector and predicate registers. */
7364 pure_scalable_type_info pst_info;
7365 if (pst_info.analyze_registers (valtype))
7366 return false;
7367
7368 return true;
7369 }
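
/* For example (big-endian only, since this hook returns false
   otherwise): a 6-byte struct { short a, b, c; } is a composite that is
   neither an HFA/HVA nor a pure scalable type, so its value is returned
   in the most significant bytes of X0 rather than in the low bits.  */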
7370
7371 /* Implement TARGET_FUNCTION_VALUE.
7372 Define how to find the value returned by a function. */
7373
7374 static rtx
7375 aarch64_function_value (const_tree type, const_tree func,
7376 bool outgoing ATTRIBUTE_UNUSED)
7377 {
7378 machine_mode mode;
7379 int unsignedp;
7380
7381 mode = TYPE_MODE (type);
7382 if (INTEGRAL_TYPE_P (type))
7383 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7384
7385 pure_scalable_type_info pst_info;
7386 if (type && pst_info.analyze_registers (type))
7387 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7388
7389 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7390 are returned in memory, not by value. */
7391 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7392 bool sve_p = (vec_flags & VEC_ANY_SVE);
7393
7394 if (aarch64_return_in_msb (type))
7395 {
7396 HOST_WIDE_INT size = int_size_in_bytes (type);
7397
7398 if (size % UNITS_PER_WORD != 0)
7399 {
7400 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7401 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7402 }
7403 }
7404
7405 int count;
7406 machine_mode ag_mode;
7407 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7408 NULL, false))
7409 {
7410 gcc_assert (!sve_p);
7411 if (!aarch64_composite_type_p (type, mode))
7412 {
7413 gcc_assert (count == 1 && mode == ag_mode);
7414 return gen_rtx_REG (mode, V0_REGNUM);
7415 }
7416 else if (aarch64_advsimd_full_struct_mode_p (mode)
7417 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7418 return gen_rtx_REG (mode, V0_REGNUM);
7419 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7420 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7421 return gen_rtx_REG (mode, V0_REGNUM);
7422 else
7423 {
7424 int i;
7425 rtx par;
7426
7427 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7428 for (i = 0; i < count; i++)
7429 {
7430 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7431 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7432 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7433 XVECEXP (par, 0, i) = tmp;
7434 }
7435 return par;
7436 }
7437 }
7438 else
7439 {
7440 if (sve_p)
7441 {
7442 /* Vector types can acquire a partial SVE mode using things like
7443 __attribute__((vector_size(N))), and this is potentially useful.
7444 However, the choice of mode doesn't affect the type's ABI
7445 identity, so we should treat the types as though they had
7446 the associated integer mode, just like they did before SVE
7447 was introduced.
7448
7449 We know that the vector must be 128 bits or smaller,
7450 otherwise we'd have returned it in memory instead. */
7451 gcc_assert (type
7452 && (aarch64_some_values_include_pst_objects_p (type)
7453 || (vec_flags & VEC_PARTIAL)));
7454
7455 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7456 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7457 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7458 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7459 }
7460 return gen_rtx_REG (mode, R0_REGNUM);
7461 }
7462 }
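
/* Some illustrative instances of the cases above: an __int128 result is
   returned in X0 and X1; a homogeneous floating-point aggregate such as
   struct { float a, b, c; } is returned in S0-S2 via the PARALLEL built
   above; and a fixed-length vector that happens to have a partial SVE
   mode is returned as though it had the equivalent integer mode, in X0.  */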
7463
7464 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7465 Return true if REGNO is the number of a hard register in which the values
7466 of a called function may come back. */
7467
7468 static bool
7469 aarch64_function_value_regno_p (const unsigned int regno)
7470 {
7471 /* A maximum of 16 bytes can be returned in the general registers. Examples
7472 of 16-byte return values are: 128-bit integers and 16-byte small
7473 structures (excluding homogeneous floating-point aggregates). */
7474 if (regno == R0_REGNUM || regno == R1_REGNUM)
7475 return true;
7476
7477 /* Up to four fp/simd registers can return a function value, e.g. a
7478 homogeneous floating-point aggregate having four members. */
7479 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7480 return TARGET_FLOAT;
7481
7482 return false;
7483 }
7484
7485 /* Subroutine for aarch64_return_in_memory for types that are not returned
7486 in SVE registers. */
7487
7488 static bool
7489 aarch64_return_in_memory_1 (const_tree type)
7490 {
7491 HOST_WIDE_INT size;
7492 machine_mode ag_mode;
7493 int count;
7494
7495 if (!AGGREGATE_TYPE_P (type)
7496 && TREE_CODE (type) != COMPLEX_TYPE
7497 && TREE_CODE (type) != VECTOR_TYPE)
7498 /* Simple scalar types are always returned in registers. */
7499 return false;
7500
7501 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7502 &ag_mode, &count, NULL, false))
7503 return false;
7504
7505 /* Types larger than 2 registers are returned in memory. */
7506 size = int_size_in_bytes (type);
7507 return (size < 0 || size > 2 * UNITS_PER_WORD);
7508 }
7509
7510 /* Implement TARGET_RETURN_IN_MEMORY.
7511
7512 If the type T of the result of a function is such that
7513 void func (T arg)
7514 would require that arg be passed as a value in a register (or set of
7515 registers) according to the parameter passing rules, then the result
7516 is returned in the same registers as would be used for such an
7517 argument. */
7518
7519 static bool
7520 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7521 {
7522 pure_scalable_type_info pst_info;
7523 switch (pst_info.analyze (type))
7524 {
7525 case pure_scalable_type_info::IS_PST:
7526 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7527 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7528
7529 case pure_scalable_type_info::DOESNT_MATTER:
7530 gcc_assert (aarch64_return_in_memory_1 (type));
7531 return true;
7532
7533 case pure_scalable_type_info::NO_ABI_IDENTITY:
7534 case pure_scalable_type_info::ISNT_PST:
7535 return aarch64_return_in_memory_1 (type);
7536 }
7537 gcc_unreachable ();
7538 }
7539
7540 static bool
7541 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7542 const_tree type, int *nregs)
7543 {
7544 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7545 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7546 &pcum->aapcs_vfp_rmode,
7547 nregs, NULL, pcum->silent_p);
7548 }
7549
7550 /* Given MODE and TYPE of a function argument, return the alignment in
7551 bits. The idea is to suppress any stronger alignment requested by
7552 the user and opt for the natural alignment (specified in AAPCS64 \S
7553 4.1). ABI_BREAK is set to the old alignment if the alignment was
7554 incorrectly calculated in versions of GCC prior to GCC-9.
7555 ABI_BREAK_PACKED is set to the old alignment if it was incorrectly
7556 calculated in versions between GCC-9 and GCC-13. This is a helper
7557 function for local use only. */
7558
7559 static unsigned int
7560 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7561 unsigned int *abi_break,
7562 unsigned int *abi_break_packed)
7563 {
7564 *abi_break = 0;
7565 *abi_break_packed = 0;
7566 if (!type)
7567 return GET_MODE_ALIGNMENT (mode);
7568
7569 if (integer_zerop (TYPE_SIZE (type)))
7570 return 0;
7571
7572 gcc_assert (TYPE_MODE (type) == mode);
7573
7574 if (!AGGREGATE_TYPE_P (type))
7575 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
7576
7577 if (TREE_CODE (type) == ARRAY_TYPE)
7578 return TYPE_ALIGN (TREE_TYPE (type));
7579
7580 unsigned int alignment = 0;
7581 unsigned int bitfield_alignment_with_packed = 0;
7582 unsigned int bitfield_alignment = 0;
7583 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7584 if (TREE_CODE (field) == FIELD_DECL)
7585 {
7586 /* Note that we explicitly consider zero-sized fields here,
7587 even though they don't map to AAPCS64 machine types.
7588 For example, in:
7589
7590 struct __attribute__((aligned(8))) empty {};
7591
7592 struct s {
7593 [[no_unique_address]] empty e;
7594 int x;
7595 };
7596
7597 "s" contains only one Fundamental Data Type (the int field)
7598 but gains 8-byte alignment and size thanks to "e". */
7599 alignment = std::max (alignment, DECL_ALIGN (field));
7600 if (DECL_BIT_FIELD_TYPE (field))
7601 {
7602 /* Take the bit-field type's alignment into account only
7603 if the user didn't reduce this field's alignment with
7604 the packed attribute. */
7605 if (!DECL_PACKED (field))
7606 bitfield_alignment
7607 = std::max (bitfield_alignment,
7608 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7609
7610 /* Compute the alignment even if the bit-field is
7611 packed, so that we can emit a warning in case the
7612 alignment changed between GCC versions. */
7613 bitfield_alignment_with_packed
7614 = std::max (bitfield_alignment_with_packed,
7615 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7616 }
7617 }
7618
7619 /* Emit a warning if the alignment is different when taking the
7620 'packed' attribute into account. */
7621 if (bitfield_alignment != bitfield_alignment_with_packed
7622 && bitfield_alignment_with_packed > alignment)
7623 *abi_break_packed = bitfield_alignment_with_packed;
7624
7625 if (bitfield_alignment > alignment)
7626 {
7627 *abi_break = alignment;
7628 return bitfield_alignment;
7629 }
7630
7631 return alignment;
7632 }
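
/* A rough illustration of the bit-field handling above (the type is
   hypothetical, not taken from the ABI documents): for something along
   the lines of

     struct S { __int128 x : 64; int y; };

   the AAPCS64 alignment comes from the bit-field's underlying __int128
   type, which versions of GCC before 9.1 ignored (ABI_BREAK records
   that older, smaller alignment).  If the field is additionally marked
   with __attribute__((packed)), GCC 13.1 and later ignore the
   underlying type again, whereas GCC 9 to 12 did not (ABI_BREAK_PACKED
   records that older, larger alignment).  */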
7633
7634 /* Layout a function argument according to the AAPCS64 rules. The rule
7635 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7636 mode that was originally given to us by the target hook, whereas the
7637 mode in ARG might be the result of replacing partial SVE modes with
7638 the equivalent integer mode. */
7639
7640 static void
7641 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7642 {
7643 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7644 tree type = arg.type;
7645 machine_mode mode = arg.mode;
7646 int ncrn, nvrn, nregs;
7647 bool allocate_ncrn, allocate_nvrn;
7648 HOST_WIDE_INT size;
7649 unsigned int abi_break;
7650 unsigned int abi_break_packed;
7651
7652 /* We need to do this once per argument. */
7653 if (pcum->aapcs_arg_processed)
7654 return;
7655
7656 bool warn_pcs_change
7657 = (warn_psabi
7658 && !pcum->silent_p
7659 && (currently_expanding_function_start
7660 || currently_expanding_gimple_stmt));
7661
7662 /* There are several things to note here:
7663
7664 - Both the C and AAPCS64 interpretations of a type's alignment should
7665 give a value that is no greater than the type's size.
7666
7667 - Types bigger than 16 bytes are passed indirectly.
7668
7669 - If an argument of type T is passed indirectly, TYPE and MODE describe
7670 a pointer to T rather than T itself.
7671
7672 It follows that the AAPCS64 alignment of TYPE must be no greater
7673 than 16 bytes.
7674
7675 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7676 and so could calculate an alignment that was too small. If this
7677 happened for TYPE then ABI_BREAK is this older, too-small alignment.
7678
7679 Although GCC 9.1 fixed that bug, it introduced a different one:
7680 it would consider the alignment of a bitfield's underlying type even
7681 if the field was packed (which should have the effect of overriding
7682 the alignment of the underlying type). This was fixed in GCC 13.1.
7683
7684 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7685 that was too big. If this happened for TYPE, ABI_BREAK_PACKED is
7686 this older, too-big alignment.
7687
7688 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7689 alignments meant they could calculate type alignments that were
7690 bigger than the type's size, contrary to the assumption above.
7691 The handling of register arguments was nevertheless (and justifiably)
7692 written to follow the assumption that the alignment can never be
7693 greater than the size. The same was not true for stack arguments;
7694 their alignment was instead handled by MIN bounds in
7695 aarch64_function_arg_boundary.
7696
7697 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7698 an alignment of more than 16 bytes for TYPE then:
7699
7700 - If the argument was passed in registers, these GCC versions
7701 would treat the alignment as though it was *less than* 16 bytes.
7702
7703 - If the argument was passed on the stack, these GCC versions
7704 would treat the alignment as though it was *equal to* 16 bytes.
7705
7706 Both behaviors were wrong, but in different cases. */
7707 unsigned int alignment
7708 = aarch64_function_arg_alignment (mode, type, &abi_break,
7709 &abi_break_packed);
7710 gcc_assert (alignment <= 16 * BITS_PER_UNIT
7711 && (!alignment || abi_break < alignment)
7712 && (!abi_break_packed || alignment < abi_break_packed));
7713
7714 pcum->aapcs_arg_processed = true;
7715
7716 pure_scalable_type_info pst_info;
7717 if (type && pst_info.analyze_registers (type))
7718 {
7719 /* aarch64_function_arg_alignment has never had an effect on
7720 this case. */
7721
7722 /* The PCS says that it is invalid to pass an SVE value to an
7723 unprototyped function. There is no ABI-defined location we
7724 can return in this case, so we have no real choice but to raise
7725 an error immediately, even though this is only a query function. */
7726 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7727 {
7728 gcc_assert (!pcum->silent_p);
7729 error ("SVE type %qT cannot be passed to an unprototyped function",
7730 arg.type);
7731 /* Avoid repeating the message, and avoid tripping the assert
7732 below. */
7733 pcum->pcs_variant = ARM_PCS_SVE;
7734 }
7735
7736 /* We would have converted the argument into pass-by-reference
7737 form if it didn't fit in registers. */
7738 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7739 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7740 gcc_assert (arg.named
7741 && pcum->pcs_variant == ARM_PCS_SVE
7742 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7743 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7744 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7745 P0_REGNUM + pcum->aapcs_nprn);
7746 return;
7747 }
7748
7749 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7750 are passed by reference, not by value. */
7751 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7752 bool sve_p = (vec_flags & VEC_ANY_SVE);
7753 if (sve_p)
7754 /* Vector types can acquire a partial SVE mode using things like
7755 __attribute__((vector_size(N))), and this is potentially useful.
7756 However, the choice of mode doesn't affect the type's ABI
7757 identity, so we should treat the types as though they had
7758 the associated integer mode, just like they did before SVE
7759 was introduced.
7760
7761 We know that the vector must be 128 bits or smaller,
7762 otherwise we'd have passed it in memory instead. */
7763 gcc_assert (type
7764 && (aarch64_some_values_include_pst_objects_p (type)
7765 || (vec_flags & VEC_PARTIAL)));
7766
7767 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
7768 if (type)
7769 size = int_size_in_bytes (type);
7770 else
7771 /* No frontends can create types with variable-sized modes, so we
7772 shouldn't be asked to pass or return them. */
7773 size = GET_MODE_SIZE (mode).to_constant ();
7774 size = ROUND_UP (size, UNITS_PER_WORD);
7775
7776 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7777 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7778 mode,
7779 type,
7780 &nregs);
7781 gcc_assert (!sve_p || !allocate_nvrn);
7782
7783 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
7784 The following code thus handles passing by SIMD/FP registers first. */
7785
7786 nvrn = pcum->aapcs_nvrn;
7787
7788 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7789 and homogeneous short-vector aggregates (HVA). */
7790 if (allocate_nvrn)
7791 {
7792 /* aarch64_function_arg_alignment has never had an effect on
7793 this case. */
7794 if (!pcum->silent_p && !TARGET_FLOAT)
7795 aarch64_err_no_fpadvsimd (mode);
7796
7797 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7798 {
7799 pcum->aapcs_nextnvrn = nvrn + nregs;
7800 if (!aarch64_composite_type_p (type, mode))
7801 {
7802 gcc_assert (nregs == 1);
7803 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7804 }
7805 else if (aarch64_advsimd_full_struct_mode_p (mode)
7806 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7807 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7808 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7809 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7810 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7811 else
7812 {
7813 rtx par;
7814 int i;
7815 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7816 for (i = 0; i < nregs; i++)
7817 {
7818 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7819 V0_REGNUM + nvrn + i);
7820 rtx offset = gen_int_mode
7821 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7822 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7823 XVECEXP (par, 0, i) = tmp;
7824 }
7825 pcum->aapcs_reg = par;
7826 }
7827 return;
7828 }
7829 else
7830 {
7831 /* C.3 NSRN is set to 8. */
7832 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7833 goto on_stack;
7834 }
7835 }
7836
7837 ncrn = pcum->aapcs_ncrn;
7838 nregs = size / UNITS_PER_WORD;
7839
7840 /* C6 - C9, though the sign and zero extension semantics are
7841 handled elsewhere. This is the case where the argument fits
7842 entirely in general registers. */
7843 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7844 {
7845 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7846
7847 /* C.8 if the argument has an alignment of 16 then the NGRN is
7848 rounded up to the next even number. */
7849 if (nregs == 2
7850 && ncrn % 2)
7851 {
7852 /* Emit a warning if the alignment changed when taking the
7853 'packed' attribute into account. */
7854 if (warn_pcs_change
7855 && abi_break_packed
7856 && ((abi_break_packed == 16 * BITS_PER_UNIT)
7857 != (alignment == 16 * BITS_PER_UNIT)))
7858 inform (input_location, "parameter passing for argument of type "
7859 "%qT changed in GCC 13.1", type);
7860
7861 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7862 comparison is there because for > 16 * BITS_PER_UNIT
7863 alignment nregs should be > 2 and therefore it should be
7864 passed by reference rather than value. */
7865 if (alignment == 16 * BITS_PER_UNIT)
7866 {
7867 if (warn_pcs_change && abi_break)
7868 inform (input_location, "parameter passing for argument of type "
7869 "%qT changed in GCC 9.1", type);
7870 ++ncrn;
7871 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7872 }
7873 }
7874
7875 /* If an argument with an SVE mode needs to be shifted up to the
7876 high part of the register, treat it as though it had an integer mode.
7877 Using the normal (parallel [...]) would suppress the shifting. */
7878 if (sve_p
7879 && BYTES_BIG_ENDIAN
7880 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7881 && aarch64_pad_reg_upward (mode, type, false))
7882 {
7883 mode = int_mode_for_mode (mode).require ();
7884 sve_p = false;
7885 }
7886
7887 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7888 A reg is still generated for it, but the caller should be smart
7889 enough not to use it. */
7890 if (nregs == 0
7891 || (nregs == 1 && !sve_p)
7892 || GET_MODE_CLASS (mode) == MODE_INT)
7893 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7894 else
7895 {
7896 rtx par;
7897 int i;
7898
7899 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7900 for (i = 0; i < nregs; i++)
7901 {
7902 scalar_int_mode reg_mode = word_mode;
7903 if (nregs == 1)
7904 reg_mode = int_mode_for_mode (mode).require ();
7905 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7906 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7907 GEN_INT (i * UNITS_PER_WORD));
7908 XVECEXP (par, 0, i) = tmp;
7909 }
7910 pcum->aapcs_reg = par;
7911 }
7912
7913 pcum->aapcs_nextncrn = ncrn + nregs;
7914 return;
7915 }
7916
7917 /* C.11 */
7918 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7919
7920 /* The argument is passed on the stack; record the needed number of words for
7921 this argument and align the total size if necessary. */
7922 on_stack:
7923 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7924
7925 if (warn_pcs_change
7926 && abi_break_packed
7927 && ((abi_break_packed >= 16 * BITS_PER_UNIT)
7928 != (alignment >= 16 * BITS_PER_UNIT)))
7929 inform (input_location, "parameter passing for argument of type "
7930 "%qT changed in GCC 13.1", type);
7931
7932 if (alignment == 16 * BITS_PER_UNIT)
7933 {
7934 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7935 if (pcum->aapcs_stack_size != new_size)
7936 {
7937 if (warn_pcs_change && abi_break)
7938 inform (input_location, "parameter passing for argument of type "
7939 "%qT changed in GCC 9.1", type);
7940 pcum->aapcs_stack_size = new_size;
7941 }
7942 }
7943 return;
7944 }
7945
7946 /* Implement TARGET_FUNCTION_ARG. */
7947
7948 static rtx
7949 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7950 {
7951 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7952 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7953 || pcum->pcs_variant == ARM_PCS_SIMD
7954 || pcum->pcs_variant == ARM_PCS_SVE);
7955
7956 if (arg.end_marker_p ())
7957 return gen_int_mode (pcum->pcs_variant, DImode);
7958
7959 aarch64_layout_arg (pcum_v, arg);
7960 return pcum->aapcs_reg;
7961 }
7962
7963 void
7964 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7965 const_tree fntype,
7966 rtx libname ATTRIBUTE_UNUSED,
7967 const_tree fndecl ATTRIBUTE_UNUSED,
7968 unsigned n_named ATTRIBUTE_UNUSED,
7969 bool silent_p)
7970 {
7971 pcum->aapcs_ncrn = 0;
7972 pcum->aapcs_nvrn = 0;
7973 pcum->aapcs_nprn = 0;
7974 pcum->aapcs_nextncrn = 0;
7975 pcum->aapcs_nextnvrn = 0;
7976 pcum->aapcs_nextnprn = 0;
7977 if (fntype)
7978 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7979 else
7980 pcum->pcs_variant = ARM_PCS_AAPCS64;
7981 pcum->aapcs_reg = NULL_RTX;
7982 pcum->aapcs_arg_processed = false;
7983 pcum->aapcs_stack_words = 0;
7984 pcum->aapcs_stack_size = 0;
7985 pcum->silent_p = silent_p;
7986
7987 if (!silent_p
7988 && !TARGET_FLOAT
7989 && fntype && fntype != error_mark_node)
7990 {
7991 const_tree type = TREE_TYPE (fntype);
7992 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7993 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7994 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7995 &mode, &nregs, NULL, false))
7996 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7997 }
7998
7999 if (!silent_p
8000 && !TARGET_SVE
8001 && pcum->pcs_variant == ARM_PCS_SVE)
8002 {
8003 /* We can't gracefully recover at this point, so make this a
8004 fatal error. */
8005 if (fndecl)
8006 fatal_error (input_location, "%qE requires the SVE ISA extension",
8007 fndecl);
8008 else
8009 fatal_error (input_location, "calls to functions of type %qT require"
8010 " the SVE ISA extension", fntype);
8011 }
8012 }
8013
8014 static void
8015 aarch64_function_arg_advance (cumulative_args_t pcum_v,
8016 const function_arg_info &arg)
8017 {
8018 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
8019 if (pcum->pcs_variant == ARM_PCS_AAPCS64
8020 || pcum->pcs_variant == ARM_PCS_SIMD
8021 || pcum->pcs_variant == ARM_PCS_SVE)
8022 {
8023 aarch64_layout_arg (pcum_v, arg);
8024 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
8025 != (pcum->aapcs_stack_words != 0));
8026 pcum->aapcs_arg_processed = false;
8027 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
8028 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
8029 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
8030 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
8031 pcum->aapcs_stack_words = 0;
8032 pcum->aapcs_reg = NULL_RTX;
8033 }
8034 }
8035
8036 bool
8037 aarch64_function_arg_regno_p (unsigned regno)
8038 {
8039 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
8040 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
8041 }
8042
8043 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
8044 PARM_BOUNDARY bits of alignment, but will be given anything up
8045 to STACK_BOUNDARY bits if the type requires it. This makes sure
8046 that both before and after the layout of each argument, the Next
8047 Stacked Argument Address (NSAA) will have a minimum alignment of
8048 8 bytes. */
8049
8050 static unsigned int
8051 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
8052 {
8053 unsigned int abi_break;
8054 unsigned int abi_break_packed;
8055 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
8056 &abi_break,
8057 &abi_break_packed);
8058 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
8059 to emit warnings about ABI incompatibility. */
8060 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
8061 return alignment;
8062 }
8063
8064 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
8065
8066 static fixed_size_mode
8067 aarch64_get_reg_raw_mode (int regno)
8068 {
8069 if (TARGET_SVE && FP_REGNUM_P (regno))
8070 /* Don't use the SVE part of the register for __builtin_apply and
8071 __builtin_return. The SVE registers aren't used by the normal PCS,
8072 so using them there would be a waste of time. The PCS extensions
8073 for SVE types are fundamentally incompatible with the
8074 __builtin_return/__builtin_apply interface. */
8075 return as_a <fixed_size_mode> (V16QImode);
8076 return default_get_reg_raw_mode (regno);
8077 }
8078
8079 /* Implement TARGET_FUNCTION_ARG_PADDING.
8080
8081 Small aggregate types are placed at the lowest memory address.
8082
8083 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
8084
8085 static pad_direction
8086 aarch64_function_arg_padding (machine_mode mode, const_tree type)
8087 {
8088 /* On little-endian targets, the least significant byte of every stack
8089 argument is passed at the lowest byte address of the stack slot. */
8090 if (!BYTES_BIG_ENDIAN)
8091 return PAD_UPWARD;
8092
8093 /* Otherwise, integral, floating-point and pointer types are padded downward:
8094 the least significant byte of a stack argument is passed at the highest
8095 byte address of the stack slot. */
8096 if (type
8097 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
8098 || POINTER_TYPE_P (type))
8099 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
8100 return PAD_DOWNWARD;
8101
8102 /* Everything else padded upward, i.e. data in first byte of stack slot. */
8103 return PAD_UPWARD;
8104 }
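
/* As a concrete example of the rules above: on a big-endian target a
   4-byte int passed on the stack is padded downward, so its least
   significant byte ends up at the highest address of its slot, whereas
   a small structure falls through to PAD_UPWARD and starts at the
   lowest address of its slot.  */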
8105
8106 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
8107
8108 It specifies the padding for the last (and possibly the only)
8109 element of a block move between registers and memory. Assuming
8110 the block is in memory, padding upward means that the last
8111 element is padded after its most significant byte, while with
8112 downward padding the last element is padded at its least
8113 significant byte side.
8114
8115 Small aggregates and small complex types are always padded
8116 upwards.
8117
8118 We don't need to worry about homogeneous floating-point or
8119 short-vector aggregates; their move is not affected by the
8120 padding direction determined here. Regardless of endianness,
8121 each element of such an aggregate is put in the least
8122 significant bits of a fp/simd register.
8123
8124 Return !BYTES_BIG_ENDIAN if the least significant byte of the
8125 register has useful data, and return the opposite if the most
8126 significant byte does. */
8127
8128 bool
8129 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
8130 bool first ATTRIBUTE_UNUSED)
8131 {
8132
8133 /* Aside from pure scalable types, small composite types are always
8134 padded upward. */
8135 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
8136 {
8137 HOST_WIDE_INT size;
8138 if (type)
8139 size = int_size_in_bytes (type);
8140 else
8141 /* No frontends can create types with variable-sized modes, so we
8142 shouldn't be asked to pass or return them. */
8143 size = GET_MODE_SIZE (mode).to_constant ();
8144 if (size < 2 * UNITS_PER_WORD)
8145 {
8146 pure_scalable_type_info pst_info;
8147 if (pst_info.analyze_registers (type))
8148 return false;
8149 return true;
8150 }
8151 }
8152
8153 /* Otherwise, use the default padding. */
8154 return !BYTES_BIG_ENDIAN;
8155 }
8156
8157 static scalar_int_mode
8158 aarch64_libgcc_cmp_return_mode (void)
8159 {
8160 return SImode;
8161 }
8162
8163 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
8164
8165 /* We use the 12-bit shifted immediate arithmetic instructions so values
8166 must be a multiple of (1 << 12), i.e. 4096. */
8167 #define ARITH_FACTOR 4096
8168
8169 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
8170 #error Cannot use simple address calculation for stack probing
8171 #endif
8172
8173 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
8174 inclusive. These are offsets from the current stack pointer. */
8175
8176 static void
8177 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
8178 {
8179 HOST_WIDE_INT size;
8180 if (!poly_size.is_constant (&size))
8181 {
8182 sorry ("stack probes for SVE frames");
8183 return;
8184 }
8185
8186 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
8187
8188 /* See the same assertion on PROBE_INTERVAL above. */
8189 gcc_assert ((first % ARITH_FACTOR) == 0);
8190
8191 /* See if we have a constant small number of probes to generate. If so,
8192 that's the easy case. */
8193 if (size <= PROBE_INTERVAL)
8194 {
8195 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
8196
8197 emit_set_insn (reg1,
8198 plus_constant (Pmode,
8199 stack_pointer_rtx, -(first + base)));
8200 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
8201 }
8202
8203 /* The run-time loop is made up of 8 insns in the generic case while the
8204 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
8205 else if (size <= 4 * PROBE_INTERVAL)
8206 {
8207 HOST_WIDE_INT i, rem;
8208
8209 emit_set_insn (reg1,
8210 plus_constant (Pmode,
8211 stack_pointer_rtx,
8212 -(first + PROBE_INTERVAL)));
8213 emit_stack_probe (reg1);
8214
8215 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
8216 it exceeds SIZE. If only two probes are needed, this will not
8217 generate any code. Then probe at FIRST + SIZE. */
8218 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
8219 {
8220 emit_set_insn (reg1,
8221 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
8222 emit_stack_probe (reg1);
8223 }
8224
8225 rem = size - (i - PROBE_INTERVAL);
8226 if (rem > 256)
8227 {
8228 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8229
8230 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
8231 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
8232 }
8233 else
8234 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
8235 }
8236
8237 /* Otherwise, do the same as above, but in a loop. Note that we must be
8238 extra careful with variables wrapping around because we might be at
8239 the very top (or the very bottom) of the address space and we have
8240 to be able to handle this case properly; in particular, we use an
8241 equality test for the loop condition. */
8242 else
8243 {
8244 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
8245
8246 /* Step 1: round SIZE to the previous multiple of the interval. */
8247
8248 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
8249
8250
8251 /* Step 2: compute initial and final value of the loop counter. */
8252
8253 /* TEST_ADDR = SP + FIRST. */
8254 emit_set_insn (reg1,
8255 plus_constant (Pmode, stack_pointer_rtx, -first));
8256
8257 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
8258 HOST_WIDE_INT adjustment = - (first + rounded_size);
8259 if (! aarch64_uimm12_shift (adjustment))
8260 {
8261 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
8262 true, Pmode);
8263 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
8264 }
8265 else
8266 emit_set_insn (reg2,
8267 plus_constant (Pmode, stack_pointer_rtx, adjustment));
8268
8269 /* Step 3: the loop
8270
8271 do
8272 {
8273 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
8274 probe at TEST_ADDR
8275 }
8276 while (TEST_ADDR != LAST_ADDR)
8277
8278 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
8279 until it is equal to ROUNDED_SIZE. */
8280
8281 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
8282
8283
8284 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
8285 that SIZE is equal to ROUNDED_SIZE. */
8286
8287 if (size != rounded_size)
8288 {
8289 HOST_WIDE_INT rem = size - rounded_size;
8290
8291 if (rem > 256)
8292 {
8293 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8294
8295 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
8296 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
8297 }
8298 else
8299 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
8300 }
8301 }
8302
8303 /* Make sure nothing is scheduled before we are done. */
8304 emit_insn (gen_blockage ());
8305 }
8306
8307 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
8308 absolute addresses. */
8309
8310 const char *
8311 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
8312 {
8313 static int labelno = 0;
8314 char loop_lab[32];
8315 rtx xops[2];
8316
8317 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
8318
8319 /* Loop. */
8320 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
8321
8322 HOST_WIDE_INT stack_clash_probe_interval
8323 = 1 << param_stack_clash_protection_guard_size;
8324
8325 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
8326 xops[0] = reg1;
8327 HOST_WIDE_INT interval;
8328 if (flag_stack_clash_protection)
8329 interval = stack_clash_probe_interval;
8330 else
8331 interval = PROBE_INTERVAL;
8332
8333 gcc_assert (aarch64_uimm12_shift (interval));
8334 xops[1] = GEN_INT (interval);
8335
8336 output_asm_insn ("sub\t%0, %0, %1", xops);
8337
8338 /* If doing stack clash protection then we probe up by the ABI-specified
8339 amount. We do this because we're dropping full pages at a time in the
8340 loop. But if we're doing non-stack clash probing, probe at SP 0. */
8341 if (flag_stack_clash_protection)
8342 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8343 else
8344 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8345
8346 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
8347 by this amount for each iteration. */
8348 output_asm_insn ("str\txzr, [%0, %1]", xops);
8349
8350 /* Test if TEST_ADDR == LAST_ADDR. */
8351 xops[1] = reg2;
8352 output_asm_insn ("cmp\t%0, %1", xops);
8353
8354 /* Branch. */
8355 fputs ("\tb.ne\t", asm_out_file);
8356 assemble_name_raw (asm_out_file, loop_lab);
8357 fputc ('\n', asm_out_file);
8358
8359 return "";
8360 }
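
/* With the default 4 KiB probe interval and x9/x10 standing in for the
   two probe registers (illustrative choices), the loop printed above
   looks roughly like:

   .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   with the store offset becoming STACK_CLASH_CALLER_GUARD when stack
   clash protection is enabled.  */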
8361
8362 /* Emit the probe loop for doing stack clash probes and stack adjustments for
8363 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8364 of GUARD_SIZE. When a probe is emitted it is done at most
8365 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
8366 at most MIN_PROBE_THRESHOLD. By the end of this function
8367 BASE = BASE - ADJUSTMENT. */
8368
8369 const char *
8370 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8371 rtx min_probe_threshold, rtx guard_size)
8372 {
8373 /* This function is not allowed to use any instruction generation function
8374 like gen_ and friends. If you do you'll likely ICE during CFG validation,
8375 so instead emit the code you want using output_asm_insn. */
8376 gcc_assert (flag_stack_clash_protection);
8377 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8378 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8379
8380 /* The minimum required allocation before the residual requires probing. */
8381 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8382
8383 /* Clamp the value down to the nearest value that can be used with a cmp. */
8384 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8385 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8386
8387 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8388 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8389
8390 static int labelno = 0;
8391 char loop_start_lab[32];
8392 char loop_end_lab[32];
8393 rtx xops[2];
8394
8395 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8396 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8397
8398 /* Emit loop start label. */
8399 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8400
8401 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8402 xops[0] = adjustment;
8403 xops[1] = probe_offset_value_rtx;
8404 output_asm_insn ("cmp\t%0, %1", xops);
8405
8406 /* Branch to end if not enough adjustment to probe. */
8407 fputs ("\tb.lt\t", asm_out_file);
8408 assemble_name_raw (asm_out_file, loop_end_lab);
8409 fputc ('\n', asm_out_file);
8410
8411 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8412 xops[0] = base;
8413 xops[1] = probe_offset_value_rtx;
8414 output_asm_insn ("sub\t%0, %0, %1", xops);
8415
8416 /* Probe at BASE. */
8417 xops[1] = const0_rtx;
8418 output_asm_insn ("str\txzr, [%0, %1]", xops);
8419
8420 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8421 xops[0] = adjustment;
8422 xops[1] = probe_offset_value_rtx;
8423 output_asm_insn ("sub\t%0, %0, %1", xops);
8424
8425 /* Branch to start if still more bytes to allocate. */
8426 fputs ("\tb\t", asm_out_file);
8427 assemble_name_raw (asm_out_file, loop_start_lab);
8428 fputc ('\n', asm_out_file);
8429
8430 /* No probe is needed for the remaining adjustment: leave the loop. */
8431 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8432
8433 /* BASE = BASE - ADJUSTMENT. */
8434 xops[0] = base;
8435 xops[1] = adjustment;
8436 output_asm_insn ("sub\t%0, %0, %1", xops);
8437 return "";
8438 }
8439
8440 /* Determine whether a frame chain needs to be generated. */
8441 static bool
8442 aarch64_needs_frame_chain (void)
8443 {
8444 /* Force a frame chain for EH returns so the return address is at FP+8. */
8445 if (frame_pointer_needed || crtl->calls_eh_return)
8446 return true;
8447
8448 /* A leaf function cannot have calls or write LR. */
8449 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8450
8451 /* Don't use a frame chain in leaf functions if leaf frame pointers
8452 are disabled. */
8453 if (flag_omit_leaf_frame_pointer && is_leaf)
8454 return false;
8455
8456 return aarch64_use_frame_pointer;
8457 }
8458
8459 /* Mark the registers that need to be saved by the callee and calculate
8460 the size of the callee-saved registers area and frame record (both FP
8461 and LR may be omitted). */
8462 static void
8463 aarch64_layout_frame (void)
8464 {
8465 poly_int64 offset = 0;
8466 int regno, last_fp_reg = INVALID_REGNUM;
8467 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8468 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8469 bool frame_related_fp_reg_p = false;
8470 aarch64_frame &frame = cfun->machine->frame;
8471
8472 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8473
8474 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8475 the mid-end is doing. */
8476 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8477
8478 #define SLOT_NOT_REQUIRED (-2)
8479 #define SLOT_REQUIRED (-1)
8480
8481 frame.wb_push_candidate1 = INVALID_REGNUM;
8482 frame.wb_push_candidate2 = INVALID_REGNUM;
8483 frame.spare_pred_reg = INVALID_REGNUM;
8484
8485 /* First mark all the registers that really need to be saved... */
8486 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8487 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8488
8489 /* ... that includes the eh data registers (if needed)... */
8490 if (crtl->calls_eh_return)
8491 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8492 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8493
8494 /* ... and any callee saved register that dataflow says is live. */
8495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8496 if (df_regs_ever_live_p (regno)
8497 && !fixed_regs[regno]
8498 && (regno == R30_REGNUM
8499 || !crtl->abi->clobbers_full_reg_p (regno)))
8500 frame.reg_offset[regno] = SLOT_REQUIRED;
8501
8502 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8503 if (df_regs_ever_live_p (regno)
8504 && !fixed_regs[regno]
8505 && !crtl->abi->clobbers_full_reg_p (regno))
8506 {
8507 frame.reg_offset[regno] = SLOT_REQUIRED;
8508 last_fp_reg = regno;
8509 if (aarch64_emit_cfi_for_reg_p (regno))
8510 frame_related_fp_reg_p = true;
8511 }
8512
8513 /* Big-endian SVE frames need a spare predicate register in order
8514 to save Z8-Z15. Decide which register they should use. Prefer
8515 an unused argument register if possible, so that we don't force P4
8516 to be saved unnecessarily. */
8517 if (frame_related_fp_reg_p
8518 && crtl->abi->id () == ARM_PCS_SVE
8519 && BYTES_BIG_ENDIAN)
8520 {
8521 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8522 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8523 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8524 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8525 break;
8526 gcc_assert (regno <= P7_REGNUM);
8527 frame.spare_pred_reg = regno;
8528 df_set_regs_ever_live (regno, true);
8529 }
8530
8531 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8532 if (df_regs_ever_live_p (regno)
8533 && !fixed_regs[regno]
8534 && !crtl->abi->clobbers_full_reg_p (regno))
8535 frame.reg_offset[regno] = SLOT_REQUIRED;
8536
8537 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8538 LR counts as an implicit probe which allows us to maintain the invariant
8539 described in the comment at expand_prologue. */
8540 gcc_assert (crtl->is_leaf
8541 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8542
8543 /* Now assign stack slots for the registers. Start with the predicate
8544 registers, since predicate LDR and STR have a relatively small
8545 offset range. These saves happen below the hard frame pointer. */
8546 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8547 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8548 {
8549 frame.reg_offset[regno] = offset;
8550 offset += BYTES_PER_SVE_PRED;
8551 }
8552
8553 if (maybe_ne (offset, 0))
8554 {
8555 /* If we have any vector registers to save above the predicate registers,
8556 the offset of the vector register save slots needs to be a multiple
8557 of the vector size. This lets us use the immediate forms of LDR/STR
8558 (or LD1/ST1 for big-endian).
8559
8560 A vector register is 8 times the size of a predicate register,
8561 and we need to save a maximum of 12 predicate registers, so the
8562 first vector register will be at either #1, MUL VL or #2, MUL VL.
8563
8564 If we don't have any vector registers to save, and we know how
8565 big the predicate save area is, we can just round it up to the
8566 next 16-byte boundary. */
8567 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8568 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8569 else
8570 {
8571 if (known_le (offset, vector_save_size))
8572 offset = vector_save_size;
8573 else if (known_le (offset, vector_save_size * 2))
8574 offset = vector_save_size * 2;
8575 else
8576 gcc_unreachable ();
8577 }
8578 }
8579
8580 /* If we need to save any SVE vector registers, add them next. */
8581 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8582 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8583 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8584 {
8585 frame.reg_offset[regno] = offset;
8586 offset += vector_save_size;
8587 }
8588
8589 /* OFFSET is now the offset of the hard frame pointer from the bottom
8590 of the callee save area. */
8591 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8592 frame.below_hard_fp_saved_regs_size = offset;
8593 if (frame.emit_frame_chain)
8594 {
8595 /* FP and LR are placed in the linkage record. */
8596 frame.reg_offset[R29_REGNUM] = offset;
8597 frame.wb_push_candidate1 = R29_REGNUM;
8598 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8599 frame.wb_push_candidate2 = R30_REGNUM;
8600 offset += 2 * UNITS_PER_WORD;
8601 }
8602
8603 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8604 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8605 {
8606 frame.reg_offset[regno] = offset;
8607 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8608 frame.wb_push_candidate1 = regno;
8609 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8610 frame.wb_push_candidate2 = regno;
8611 offset += UNITS_PER_WORD;
8612 }
8613
8614 poly_int64 max_int_offset = offset;
8615 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8616 bool has_align_gap = maybe_ne (offset, max_int_offset);
8617
8618 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8619 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8620 {
8621 /* If there is an alignment gap between integer and fp callee-saves,
8622 allocate the last fp register to it if possible. */
8623 if (regno == last_fp_reg
8624 && has_align_gap
8625 && known_eq (vector_save_size, 8)
8626 && multiple_p (offset, 16))
8627 {
8628 frame.reg_offset[regno] = max_int_offset;
8629 break;
8630 }
8631
8632 frame.reg_offset[regno] = offset;
8633 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8634 frame.wb_push_candidate1 = regno;
8635 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8636 && frame.wb_push_candidate1 >= V0_REGNUM)
8637 frame.wb_push_candidate2 = regno;
8638 offset += vector_save_size;
8639 }
8640
8641 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8642
8643 frame.saved_regs_size = offset;
8644
8645 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8646
8647 poly_int64 above_outgoing_args
8648 = aligned_upper_bound (varargs_and_saved_regs_size
8649 + get_frame_size (),
8650 STACK_BOUNDARY / BITS_PER_UNIT);
8651
8652 frame.hard_fp_offset
8653 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8654
8655 /* Both these values are already aligned. */
8656 gcc_assert (multiple_p (crtl->outgoing_args_size,
8657 STACK_BOUNDARY / BITS_PER_UNIT));
8658 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8659
8660 frame.locals_offset = frame.saved_varargs_size;
8661
8662 frame.initial_adjust = 0;
8663 frame.final_adjust = 0;
8664 frame.callee_adjust = 0;
8665 frame.sve_callee_adjust = 0;
8666 frame.callee_offset = 0;
8667
8668 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8669 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8670
8671 /* Shadow call stack is only used for functions that push LR onto the
8672 stack and that do not specify the "no_sanitize" attribute with the
8673 argument "shadow-call-stack". */
8674 frame.is_scs_enabled
8675 = (!crtl->calls_eh_return
8676 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8677 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8678
8679 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8680 restore x30, and we don't need to pop x30 again in the traditional
8681 way. Pop candidates record the registers that need to be popped
8682 eventually. */
8683 if (frame.is_scs_enabled)
8684 {
8685 if (frame.wb_pop_candidate2 == R30_REGNUM)
8686 frame.wb_pop_candidate2 = INVALID_REGNUM;
8687 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8688 frame.wb_pop_candidate1 = INVALID_REGNUM;
8689 }
8690
8691 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8692 256 to ensure that the offset meets the requirements of emit_move_insn.
8693 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8694 max_push_offset to 0, because no registers are popped at this time,
8695 so callee_adjust cannot be adjusted. */
8696 HOST_WIDE_INT max_push_offset = 0;
8697 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8698 max_push_offset = 512;
8699 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8700 max_push_offset = 256;
8701
8702 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8703 HOST_WIDE_INT const_saved_regs_size;
8704 if (frame.frame_size.is_constant (&const_size)
8705 && const_size < max_push_offset
8706 && known_eq (frame.hard_fp_offset, const_size))
8707 {
8708 /* Simple, small frame with no outgoing arguments:
8709
8710 stp reg1, reg2, [sp, -frame_size]!
8711 stp reg3, reg4, [sp, 16] */
8712 frame.callee_adjust = const_size;
8713 }
8714 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8715 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8716 && const_outgoing_args_size + const_saved_regs_size < 512
8717 /* We could handle this case even with outgoing args, provided
8718 that the number of args left us with valid offsets for all
8719 predicate and vector save slots. It's such a rare case that
8720 it hardly seems worth the effort though. */
8721 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8722 && !(cfun->calls_alloca
8723 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8724 && const_fp_offset < max_push_offset))
8725 {
8726 /* Frame with small outgoing arguments:
8727
8728 sub sp, sp, frame_size
8729 stp reg1, reg2, [sp, outgoing_args_size]
8730 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8731 frame.initial_adjust = frame.frame_size;
8732 frame.callee_offset = const_outgoing_args_size;
8733 }
8734 else if (saves_below_hard_fp_p
8735 && known_eq (frame.saved_regs_size,
8736 frame.below_hard_fp_saved_regs_size))
8737 {
8738 /* Frame in which all saves are SVE saves:
8739
8740 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8741 save SVE registers relative to SP
8742 sub sp, sp, outgoing_args_size */
8743 frame.initial_adjust = (frame.hard_fp_offset
8744 + frame.below_hard_fp_saved_regs_size);
8745 frame.final_adjust = crtl->outgoing_args_size;
8746 }
8747 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8748 && const_fp_offset < max_push_offset)
8749 {
8750 /* Frame with large outgoing arguments or SVE saves, but with
8751 a small local area:
8752
8753 stp reg1, reg2, [sp, -hard_fp_offset]!
8754 stp reg3, reg4, [sp, 16]
8755 [sub sp, sp, below_hard_fp_saved_regs_size]
8756 [save SVE registers relative to SP]
8757 sub sp, sp, outgoing_args_size */
8758 frame.callee_adjust = const_fp_offset;
8759 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8760 frame.final_adjust = crtl->outgoing_args_size;
8761 }
8762 else
8763 {
8764 /* Frame with large local area and outgoing arguments or SVE saves,
8765 using frame pointer:
8766
8767 sub sp, sp, hard_fp_offset
8768 stp x29, x30, [sp, 0]
8769 add x29, sp, 0
8770 stp reg3, reg4, [sp, 16]
8771 [sub sp, sp, below_hard_fp_saved_regs_size]
8772 [save SVE registers relative to SP]
8773 sub sp, sp, outgoing_args_size */
8774 frame.initial_adjust = frame.hard_fp_offset;
8775 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8776 frame.final_adjust = crtl->outgoing_args_size;
8777 }
8778
8779 /* Make sure the individual adjustments add up to the full frame size. */
8780 gcc_assert (known_eq (frame.initial_adjust
8781 + frame.callee_adjust
8782 + frame.sve_callee_adjust
8783 + frame.final_adjust, frame.frame_size));
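/* An illustrative check of the invariant above (values are hypothetical):
   in the "frame with small outgoing arguments" case with frame_size == 112,
   saved_regs_size == 48 and outgoing_args_size == 32, we get
   initial_adjust == 112 and callee_adjust == sve_callee_adjust
   == final_adjust == 0, so the four adjustments sum to frame_size.  */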
8784
8785 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8786 {
8787 /* We've decided not to associate any register saves with the initial
8788 stack allocation. */
8789 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8790 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8791 }
8792
8793 frame.laid_out = true;
8794 }
8795
8796 /* Return true if the register REGNO is saved on entry to
8797 the current function. */
8798
8799 static bool
8800 aarch64_register_saved_on_entry (int regno)
8801 {
8802 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8803 }
8804
8805 /* Return the next register from REGNO up to and including LIMIT that the
8806 callee needs to save. */
8807
8808 static unsigned
8809 aarch64_next_callee_save (unsigned regno, unsigned limit)
8810 {
8811 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8812 regno ++;
8813 return regno;
8814 }
8815
8816 /* Push the register numbered REGNO of mode MODE to the stack with write-back
8817 adjusting the stack by ADJUSTMENT. */
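/* A minimal illustration (register and adjustment are hypothetical): for
   DImode, REGNO == x30 and ADJUSTMENT == 16, the PRE_MODIFY address built
   below corresponds to a single pre-indexed push, roughly
   "str x30, [sp, -16]!", which both stores the register and decrements
   the stack pointer.  */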
8818
8819 static void
8820 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8821 HOST_WIDE_INT adjustment)
8822 {
8823 rtx base_rtx = stack_pointer_rtx;
8824 rtx insn, reg, mem;
8825
8826 reg = gen_rtx_REG (mode, regno);
8827 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8828 plus_constant (Pmode, base_rtx, -adjustment));
8829 mem = gen_frame_mem (mode, mem);
8830
8831 insn = emit_move_insn (mem, reg);
8832 RTX_FRAME_RELATED_P (insn) = 1;
8833 }
8834
8835 /* Generate and return an instruction to store the pair of registers
8836 REG and REG2 of mode MODE to location BASE with write-back adjusting
8837 the stack location BASE by ADJUSTMENT. */
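/* Illustrative example (hypothetical operands): for E_DImode with
   REG == x29, REG2 == x30 and ADJUSTMENT == 96, the pattern generated
   below corresponds to "stp x29, x30, [sp, -96]!": REG is stored at
   BASE - ADJUSTMENT, REG2 at BASE - ADJUSTMENT + UNITS_PER_WORD, and
   BASE is decremented by ADJUSTMENT.  */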
8838
8839 static rtx
8840 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8841 HOST_WIDE_INT adjustment)
8842 {
8843 switch (mode)
8844 {
8845 case E_DImode:
8846 return gen_storewb_pairdi_di (base, base, reg, reg2,
8847 GEN_INT (-adjustment),
8848 GEN_INT (UNITS_PER_WORD - adjustment));
8849 case E_DFmode:
8850 return gen_storewb_pairdf_di (base, base, reg, reg2,
8851 GEN_INT (-adjustment),
8852 GEN_INT (UNITS_PER_WORD - adjustment));
8853 case E_TFmode:
8854 return gen_storewb_pairtf_di (base, base, reg, reg2,
8855 GEN_INT (-adjustment),
8856 GEN_INT (UNITS_PER_VREG - adjustment));
8857 default:
8858 gcc_unreachable ();
8859 }
8860 }
8861
8862 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8863 stack pointer by ADJUSTMENT. */
8864
8865 static void
8866 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8867 {
8868 rtx_insn *insn;
8869 machine_mode mode = aarch64_reg_save_mode (regno1);
8870
8871 if (regno2 == INVALID_REGNUM)
8872 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8873
8874 rtx reg1 = gen_rtx_REG (mode, regno1);
8875 rtx reg2 = gen_rtx_REG (mode, regno2);
8876
8877 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8878 reg2, adjustment));
8879 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8880 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8881 RTX_FRAME_RELATED_P (insn) = 1;
8882 }
8883
8884 /* Generate and return an instruction to load the pair of registers REG and
8885 REG2 of mode MODE from stack location BASE, adjusting BASE by ADJUSTMENT
8886 afterwards. */
8886
8887 static rtx
8888 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8889 HOST_WIDE_INT adjustment)
8890 {
8891 switch (mode)
8892 {
8893 case E_DImode:
8894 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8895 GEN_INT (UNITS_PER_WORD));
8896 case E_DFmode:
8897 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8898 GEN_INT (UNITS_PER_WORD));
8899 case E_TFmode:
8900 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8901 GEN_INT (UNITS_PER_VREG));
8902 default:
8903 gcc_unreachable ();
8904 }
8905 }
8906
8907 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8908 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8909 into CFI_OPS. */
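/* A sketch of the two cases handled below (register numbers are
   illustrative): a single pop becomes a post-indexed load such as
   "ldr x19, [sp], 16", while a pair becomes "ldp x19, x20, [sp], 32";
   in both cases a REG_CFA_RESTORE note is queued in CFI_OPS for each
   restored register.  */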
8910
8911 static void
8912 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8913 rtx *cfi_ops)
8914 {
8915 machine_mode mode = aarch64_reg_save_mode (regno1);
8916 rtx reg1 = gen_rtx_REG (mode, regno1);
8917
8918 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8919
8920 if (regno2 == INVALID_REGNUM)
8921 {
8922 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8923 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8924 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8925 }
8926 else
8927 {
8928 rtx reg2 = gen_rtx_REG (mode, regno2);
8929 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8930 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8931 reg2, adjustment));
8932 }
8933 }
8934
8935 /* Generate and return a store pair instruction of mode MODE to store
8936 register REG1 to MEM1 and register REG2 to MEM2. */
8937
8938 static rtx
8939 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8940 rtx reg2)
8941 {
8942 switch (mode)
8943 {
8944 case E_DImode:
8945 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8946
8947 case E_DFmode:
8948 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8949
8950 case E_TFmode:
8951 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8952
8953 case E_V4SImode:
8954 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8955
8956 case E_V16QImode:
8957 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8958
8959 default:
8960 gcc_unreachable ();
8961 }
8962 }
8963
8964 /* Generate and return a load pair instruction of mode MODE to load register
8965 REG1 from MEM1 and register REG2 from MEM2. */
8966
8967 static rtx
8968 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8969 rtx mem2)
8970 {
8971 switch (mode)
8972 {
8973 case E_DImode:
8974 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8975
8976 case E_DFmode:
8977 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8978
8979 case E_TFmode:
8980 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8981
8982 case E_V4SImode:
8983 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8984
8985 default:
8986 gcc_unreachable ();
8987 }
8988 }
8989
8990 /* Return TRUE if return address signing should be enabled for the current
8991 function, otherwise return FALSE. */
8992
8993 bool
8994 aarch64_return_address_signing_enabled (void)
8995 {
8996 /* This function should only be called after the frame has been laid out. */
8997 gcc_assert (cfun->machine->frame.laid_out);
8998
8999 /* Turn return address signing off in any function that uses
9000 __builtin_eh_return. The address passed to __builtin_eh_return
9001 is not signed so either it has to be signed (with original sp)
9002 or the code path that uses it has to avoid authenticating it.
9003 Currently eh return introduces a return to anywhere gadget, no
9004 matter what we do here since it uses ret with user provided
9005 address. An ideal fix for that is to use indirect branch which
9006 can be protected with BTI j (to some extent). */
9007 if (crtl->calls_eh_return)
9008 return false;
9009
9010 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
9011 if its LR is pushed onto stack. */
9012 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
9013 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
9014 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
9015 }
9016
9017 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
9018 bool
9019 aarch64_bti_enabled (void)
9020 {
9021 return (aarch64_enable_bti == 1);
9022 }
9023
9024 /* The caller is going to use ST1D or LD1D to save or restore an SVE
9025 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
9026 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
9027
9028 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
9029 or LD1D address
9030
9031 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
9032 if the variable isn't already nonnull
9033
9034 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
9035 Handle this case using a temporary base register that is suitable for
9036 all offsets in that range. Use ANCHOR_REG as this base register if it
9037 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
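/* Worked example (values illustrative): ST1D/LD1D accept immediate offsets
   in the signed 4-bit scaled range [-8, 7] * GET_MODE_SIZE (MODE) (see
   offset_4bit_signed_scaled_p below).  If OFFSET is 10 * GET_MODE_SIZE (MODE),
   the code below sets ANCHOR_REG = BASE_RTX + 16 * GET_MODE_SIZE (MODE) and
   rewrites the access as ANCHOR_REG - 6 * GET_MODE_SIZE (MODE), which is
   back in range.  */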
9038
9039 static inline void
9040 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
9041 rtx &anchor_reg, poly_int64 &offset,
9042 rtx &ptrue)
9043 {
9044 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
9045 {
9046 /* This is the maximum valid offset of the anchor from the base.
9047 Lower values would be valid too. */
9048 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
9049 if (!anchor_reg)
9050 {
9051 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9052 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9053 gen_int_mode (anchor_offset, Pmode)));
9054 }
9055 base_rtx = anchor_reg;
9056 offset -= anchor_offset;
9057 }
9058 if (!ptrue)
9059 {
9060 int pred_reg = cfun->machine->frame.spare_pred_reg;
9061 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
9062 CONSTM1_RTX (VNx16BImode));
9063 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
9064 }
9065 }
9066
9067 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
9068 is saved at BASE + OFFSET. */
9069
9070 static void
9071 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
9072 rtx base, poly_int64 offset)
9073 {
9074 rtx mem = gen_frame_mem (GET_MODE (reg),
9075 plus_constant (Pmode, base, offset));
9076 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
9077 }
9078
9079 /* Emit code to save the callee-saved registers from register number START
9080 to LIMIT to the stack at the location starting at offset START_OFFSET,
9081 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
9082 is true if the hard frame pointer has been set up. */
9083
9084 static void
9085 aarch64_save_callee_saves (poly_int64 start_offset,
9086 unsigned start, unsigned limit, bool skip_wb,
9087 bool hard_fp_valid_p)
9088 {
9089 rtx_insn *insn;
9090 unsigned regno;
9091 unsigned regno2;
9092 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9093
9094 for (regno = aarch64_next_callee_save (start, limit);
9095 regno <= limit;
9096 regno = aarch64_next_callee_save (regno + 1, limit))
9097 {
9098 rtx reg, mem;
9099 poly_int64 offset;
9100 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9101
9102 if (skip_wb
9103 && (regno == cfun->machine->frame.wb_push_candidate1
9104 || regno == cfun->machine->frame.wb_push_candidate2))
9105 continue;
9106
9107 if (cfun->machine->reg_is_wrapped_separately[regno])
9108 continue;
9109
9110 machine_mode mode = aarch64_reg_save_mode (regno);
9111 reg = gen_rtx_REG (mode, regno);
9112 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9113 rtx base_rtx = stack_pointer_rtx;
9114 poly_int64 sp_offset = offset;
9115
9116 HOST_WIDE_INT const_offset;
9117 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9118 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9119 offset, ptrue);
9120 else if (GP_REGNUM_P (regno)
9121 && (!offset.is_constant (&const_offset) || const_offset >= 512))
9122 {
9123 gcc_assert (known_eq (start_offset, 0));
9124 poly_int64 fp_offset
9125 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9126 if (hard_fp_valid_p)
9127 base_rtx = hard_frame_pointer_rtx;
9128 else
9129 {
9130 if (!anchor_reg)
9131 {
9132 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9133 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9134 gen_int_mode (fp_offset, Pmode)));
9135 }
9136 base_rtx = anchor_reg;
9137 }
9138 offset -= fp_offset;
9139 }
9140 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9141 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
9142
9143 if (!aarch64_sve_mode_p (mode)
9144 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9145 && !cfun->machine->reg_is_wrapped_separately[regno2]
9146 && known_eq (GET_MODE_SIZE (mode),
9147 cfun->machine->frame.reg_offset[regno2]
9148 - cfun->machine->frame.reg_offset[regno]))
9149 {
9150 rtx reg2 = gen_rtx_REG (mode, regno2);
9151 rtx mem2;
9152
9153 offset += GET_MODE_SIZE (mode);
9154 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9155 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
9156 reg2));
9157
9158 /* The first part of a frame-related parallel insn is
9159 always assumed to be relevant to the frame
9160 calculations; subsequent parts are only
9161 frame-related if explicitly marked. */
9162 if (aarch64_emit_cfi_for_reg_p (regno2))
9163 {
9164 if (need_cfa_note_p)
9165 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
9166 sp_offset + GET_MODE_SIZE (mode));
9167 else
9168 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
9169 }
9170
9171 regno = regno2;
9172 }
9173 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9174 {
9175 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
9176 need_cfa_note_p = true;
9177 }
9178 else if (aarch64_sve_mode_p (mode))
9179 insn = emit_insn (gen_rtx_SET (mem, reg));
9180 else
9181 insn = emit_move_insn (mem, reg);
9182
9183 RTX_FRAME_RELATED_P (insn) = frame_related_p;
9184 if (frame_related_p && need_cfa_note_p)
9185 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
9186 }
9187 }
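/* As an illustration of the pairing logic in the function above
   (hypothetical registers and offsets): if x19 and x20 are both saved and
   their slots are 8 bytes apart, the two stores are merged into a single
   "stp x19, x20, [sp, <offset>]"; otherwise each register gets its own
   store.  */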
9188
9189 /* Emit code to restore the callee registers from register number START
9190 up to and including LIMIT. Restore from the stack offset START_OFFSET,
9191 skipping any write-back candidates if SKIP_WB is true. Write the
9192 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
9193
9194 static void
9195 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
9196 unsigned limit, bool skip_wb, rtx *cfi_ops)
9197 {
9198 unsigned regno;
9199 unsigned regno2;
9200 poly_int64 offset;
9201 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9202
9203 for (regno = aarch64_next_callee_save (start, limit);
9204 regno <= limit;
9205 regno = aarch64_next_callee_save (regno + 1, limit))
9206 {
9207 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9208 if (cfun->machine->reg_is_wrapped_separately[regno])
9209 continue;
9210
9211 rtx reg, mem;
9212
9213 if (skip_wb
9214 && (regno == cfun->machine->frame.wb_pop_candidate1
9215 || regno == cfun->machine->frame.wb_pop_candidate2))
9216 continue;
9217
9218 machine_mode mode = aarch64_reg_save_mode (regno);
9219 reg = gen_rtx_REG (mode, regno);
9220 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9221 rtx base_rtx = stack_pointer_rtx;
9222 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9223 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9224 offset, ptrue);
9225 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9226
9227 if (!aarch64_sve_mode_p (mode)
9228 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9229 && !cfun->machine->reg_is_wrapped_separately[regno2]
9230 && known_eq (GET_MODE_SIZE (mode),
9231 cfun->machine->frame.reg_offset[regno2]
9232 - cfun->machine->frame.reg_offset[regno]))
9233 {
9234 rtx reg2 = gen_rtx_REG (mode, regno2);
9235 rtx mem2;
9236
9237 offset += GET_MODE_SIZE (mode);
9238 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9239 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9240
9241 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9242 regno = regno2;
9243 }
9244 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9245 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9246 else if (aarch64_sve_mode_p (mode))
9247 emit_insn (gen_rtx_SET (reg, mem));
9248 else
9249 emit_move_insn (reg, mem);
9250 if (frame_related_p)
9251 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
9252 }
9253 }
9254
9255 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9256 of MODE. */
9257
9258 static inline bool
9259 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9260 {
9261 HOST_WIDE_INT multiple;
9262 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9263 && IN_RANGE (multiple, -8, 7));
9264 }
9265
9266 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9267 of MODE. */
9268
9269 static inline bool
9270 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9271 {
9272 HOST_WIDE_INT multiple;
9273 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9274 && IN_RANGE (multiple, -32, 31));
9275 }
9276
9277 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9278 of MODE. */
9279
9280 static inline bool
9281 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9282 {
9283 HOST_WIDE_INT multiple;
9284 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9285 && IN_RANGE (multiple, 0, 63));
9286 }
9287
9288 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9289 of MODE. */
9290
9291 bool
9292 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9293 {
9294 HOST_WIDE_INT multiple;
9295 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9296 && IN_RANGE (multiple, -64, 63));
9297 }
9298
9299 /* Return true if OFFSET is a signed 9-bit value. */
9300
9301 bool
9302 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9303 poly_int64 offset)
9304 {
9305 HOST_WIDE_INT const_offset;
9306 return (offset.is_constant (&const_offset)
9307 && IN_RANGE (const_offset, -256, 255));
9308 }
9309
9310 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9311 of MODE. */
9312
9313 static inline bool
9314 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9315 {
9316 HOST_WIDE_INT multiple;
9317 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9318 && IN_RANGE (multiple, -256, 255));
9319 }
9320
9321 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9322 of MODE. */
9323
9324 static inline bool
9325 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9326 {
9327 HOST_WIDE_INT multiple;
9328 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9329 && IN_RANGE (multiple, 0, 4095));
9330 }
9331
9332 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9333
9334 static sbitmap
9335 aarch64_get_separate_components (void)
9336 {
9337 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9338 bitmap_clear (components);
9339
9340 /* The registers we need saved to the frame. */
9341 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9342 if (aarch64_register_saved_on_entry (regno))
9343 {
9344 /* Punt on saves and restores that use ST1D and LD1D. We could
9345 try to be smarter, but it would involve making sure that the
9346 spare predicate register itself is safe to use at the save
9347 and restore points. Also, when a frame pointer is being used,
9348 the slots are often out of reach of ST1D and LD1D anyway. */
9349 machine_mode mode = aarch64_reg_save_mode (regno);
9350 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9351 continue;
9352
9353 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9354
9355 /* If the register is saved in the first SVE save slot, we use
9356 it as a stack probe for -fstack-clash-protection. */
9357 if (flag_stack_clash_protection
9358 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
9359 && known_eq (offset, 0))
9360 continue;
9361
9362 /* Get the offset relative to the register we'll use. */
9363 if (frame_pointer_needed)
9364 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9365 else
9366 offset += crtl->outgoing_args_size;
9367
9368 /* Check that we can access the stack slot of the register with one
9369 direct load with no adjustments needed. */
9370 if (aarch64_sve_mode_p (mode)
9371 ? offset_9bit_signed_scaled_p (mode, offset)
9372 : offset_12bit_unsigned_scaled_p (mode, offset))
9373 bitmap_set_bit (components, regno);
9374 }
9375
9376 /* Don't mess with the hard frame pointer. */
9377 if (frame_pointer_needed)
9378 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9379
9380 /* If the spare predicate register used by big-endian SVE code
9381 is call-preserved, it must be saved in the main prologue
9382 before any saves that use it. */
9383 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
9384 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
9385
9386 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9387 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9388 /* If registers have been chosen to be stored/restored with
9389 writeback don't interfere with them to avoid having to output explicit
9390 stack adjustment instructions. */
9391 if (reg2 != INVALID_REGNUM)
9392 bitmap_clear_bit (components, reg2);
9393 if (reg1 != INVALID_REGNUM)
9394 bitmap_clear_bit (components, reg1);
9395
9396 bitmap_clear_bit (components, LR_REGNUM);
9397 bitmap_clear_bit (components, SP_REGNUM);
9398
9399 return components;
9400 }
9401
9402 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9403
9404 static sbitmap
9405 aarch64_components_for_bb (basic_block bb)
9406 {
9407 bitmap in = DF_LIVE_IN (bb);
9408 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9409 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9410
9411 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9412 bitmap_clear (components);
9413
9414 /* Clobbered registers don't generate values in any meaningful sense,
9415 since nothing after the clobber can rely on their value. And we can't
9416 say that partially-clobbered registers are unconditionally killed,
9417 because whether they're killed or not depends on the mode of the
9418 value they're holding. Thus partially call-clobbered registers
9419 appear in neither the kill set nor the gen set.
9420
9421 Check manually for any calls that clobber more of a register than the
9422 current function can. */
9423 function_abi_aggregator callee_abis;
9424 rtx_insn *insn;
9425 FOR_BB_INSNS (bb, insn)
9426 if (CALL_P (insn))
9427 callee_abis.note_callee_abi (insn_callee_abi (insn));
9428 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9429
9430 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9431 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9432 if (!fixed_regs[regno]
9433 && !crtl->abi->clobbers_full_reg_p (regno)
9434 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9435 || bitmap_bit_p (in, regno)
9436 || bitmap_bit_p (gen, regno)
9437 || bitmap_bit_p (kill, regno)))
9438 {
9439 bitmap_set_bit (components, regno);
9440
9441 /* If there is a callee-save at an adjacent offset, add it too
9442 to increase the use of LDP/STP. */
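/* A sketch with hypothetical numbers: a slot at a 16-byte-aligned
   offset pairs with the next register at offset + 8, while a slot at
   offset + 8 pairs with the previous register at the aligned offset,
   so that either save can be combined into one LDP/STP.  */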
9443 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9444 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9445
9446 if (regno2 <= LAST_SAVED_REGNUM)
9447 {
9448 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9449 if (regno < regno2
9450 ? known_eq (offset + 8, offset2)
9451 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9452 bitmap_set_bit (components, regno2);
9453 }
9454 }
9455
9456 return components;
9457 }
9458
9459 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9460 Nothing to do for aarch64. */
9461
9462 static void
9463 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9464 {
9465 }
9466
9467 /* Return the next set bit in BMP from START onwards. Return the total number
9468 of bits in BMP if no set bit is found at or after START. */
9469
9470 static unsigned int
9471 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9472 {
9473 unsigned int nbits = SBITMAP_SIZE (bmp);
9474 if (start == nbits)
9475 return start;
9476
9477 gcc_assert (start < nbits);
9478 for (unsigned int i = start; i < nbits; i++)
9479 if (bitmap_bit_p (bmp, i))
9480 return i;
9481
9482 return nbits;
9483 }
9484
9485 /* Do the work for aarch64_emit_prologue_components and
9486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9488 for these components or the epilogue sequence. That is, it determines
9489 whether we should emit stores or loads and what kind of CFA notes to attach
9490 to the insns. Otherwise the logic for the two sequences is very
9491 similar. */
9492
9493 static void
9494 aarch64_process_components (sbitmap components, bool prologue_p)
9495 {
9496 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9497 ? HARD_FRAME_POINTER_REGNUM
9498 : STACK_POINTER_REGNUM);
9499
9500 unsigned last_regno = SBITMAP_SIZE (components);
9501 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9502 rtx_insn *insn = NULL;
9503
9504 while (regno != last_regno)
9505 {
9506 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9507 machine_mode mode = aarch64_reg_save_mode (regno);
9508
9509 rtx reg = gen_rtx_REG (mode, regno);
9510 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9511 if (frame_pointer_needed)
9512 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9513 else
9514 offset += crtl->outgoing_args_size;
9515
9516 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9517 rtx mem = gen_frame_mem (mode, addr);
9518
9519 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9520 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9521 /* No more registers to handle after REGNO.
9522 Emit a single save/restore and exit. */
9523 if (regno2 == last_regno)
9524 {
9525 insn = emit_insn (set);
9526 if (frame_related_p)
9527 {
9528 RTX_FRAME_RELATED_P (insn) = 1;
9529 if (prologue_p)
9530 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9531 else
9532 add_reg_note (insn, REG_CFA_RESTORE, reg);
9533 }
9534 break;
9535 }
9536
9537 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9538 /* The next register is not of the same class or its offset is not
9539 mergeable with the current one into a pair. */
9540 if (aarch64_sve_mode_p (mode)
9541 || !satisfies_constraint_Ump (mem)
9542 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9543 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9544 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9545 GET_MODE_SIZE (mode)))
9546 {
9547 insn = emit_insn (set);
9548 if (frame_related_p)
9549 {
9550 RTX_FRAME_RELATED_P (insn) = 1;
9551 if (prologue_p)
9552 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9553 else
9554 add_reg_note (insn, REG_CFA_RESTORE, reg);
9555 }
9556
9557 regno = regno2;
9558 continue;
9559 }
9560
9561 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9562
9563 /* REGNO2 can be saved/restored in a pair with REGNO. */
9564 rtx reg2 = gen_rtx_REG (mode, regno2);
9565 if (frame_pointer_needed)
9566 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9567 else
9568 offset2 += crtl->outgoing_args_size;
9569 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9570 rtx mem2 = gen_frame_mem (mode, addr2);
9571 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9572 : gen_rtx_SET (reg2, mem2);
9573
9574 if (prologue_p)
9575 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9576 else
9577 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9578
9579 if (frame_related_p || frame_related2_p)
9580 {
9581 RTX_FRAME_RELATED_P (insn) = 1;
9582 if (prologue_p)
9583 {
9584 if (frame_related_p)
9585 add_reg_note (insn, REG_CFA_OFFSET, set);
9586 if (frame_related2_p)
9587 add_reg_note (insn, REG_CFA_OFFSET, set2);
9588 }
9589 else
9590 {
9591 if (frame_related_p)
9592 add_reg_note (insn, REG_CFA_RESTORE, reg);
9593 if (frame_related2_p)
9594 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9595 }
9596 }
9597
9598 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9599 }
9600 }
9601
9602 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9603
9604 static void
9605 aarch64_emit_prologue_components (sbitmap components)
9606 {
9607 aarch64_process_components (components, true);
9608 }
9609
9610 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9611
9612 static void
9613 aarch64_emit_epilogue_components (sbitmap components)
9614 {
9615 aarch64_process_components (components, false);
9616 }
9617
9618 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9619
9620 static void
9621 aarch64_set_handled_components (sbitmap components)
9622 {
9623 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9624 if (bitmap_bit_p (components, regno))
9625 cfun->machine->reg_is_wrapped_separately[regno] = true;
9626 }
9627
9628 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9629 determine the probe offset for alloca. */
9630
9631 static HOST_WIDE_INT
9632 aarch64_stack_clash_protection_alloca_probe_range (void)
9633 {
9634 return STACK_CLASH_CALLER_GUARD;
9635 }
9636
9637
9638 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9639 registers. If POLY_SIZE is not large enough to require a probe this function
9640 will only adjust the stack. When allocating the stack space
9641 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9642 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9643 arguments. If so, we ensure that any allocation larger than the ABI
9644 defined buffer needs a probe so that the invariant of having a 1KB buffer is
9645 maintained.
9646
9647 We emit barriers after each stack adjustment to prevent optimizations from
9648 breaking the invariant that we never drop the stack more than a page. This
9649 invariant is needed to make it easier to correctly handle asynchronous
9650 events, e.g. if we were to allow the stack to be dropped by more than a page
9651 and only probe the skipped pages afterwards: if a signal arrived somewhere
9652 in between, the signal handler would not know the state of the stack and
9653 could make no assumptions about which pages had been probed. */
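/* As a concrete illustration (using the default parameters described in the
   frame layout comment further below): with a 64KB guard and a 1KB
   caller-reserved buffer, MIN_PROBE_THRESHOLD is 63KB for the initial and
   SVE adjustments and 1KB for the final (outgoing-argument) adjustment, so
   smaller allocations are emitted as plain stack adjustments with no
   probes.  */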
9654
9655 static void
9656 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9657 poly_int64 poly_size,
9658 bool frame_related_p,
9659 bool final_adjustment_p)
9660 {
9661 HOST_WIDE_INT guard_size
9662 = 1 << param_stack_clash_protection_guard_size;
9663 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9664 HOST_WIDE_INT min_probe_threshold
9665 = (final_adjustment_p
9666 ? guard_used_by_caller
9667 : guard_size - guard_used_by_caller);
9668 /* When doing the final adjustment for the outgoing arguments, take into
9669 account any unprobed space there is above the current SP. There are
9670 two cases:
9671
9672 - When saving SVE registers below the hard frame pointer, we force
9673 the lowest save to take place in the prologue before doing the final
9674 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9675 This acts as a probe at SP, so there is no unprobed space.
9676
9677 - When there are no SVE register saves, we use the store of the link
9678 register as a probe. We can't assume that LR was saved at position 0
9679 though, so treat any space below it as unprobed. */
9680 if (final_adjustment_p
9681 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9682 {
9683 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9684 if (known_ge (lr_offset, 0))
9685 min_probe_threshold -= lr_offset.to_constant ();
9686 else
9687 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9688 }
9689
9690 poly_int64 frame_size = cfun->machine->frame.frame_size;
9691
9692 /* We should always have a positive probe threshold. */
9693 gcc_assert (min_probe_threshold > 0);
9694
9695 if (flag_stack_clash_protection && !final_adjustment_p)
9696 {
9697 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9698 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9699 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9700
9701 if (known_eq (frame_size, 0))
9702 {
9703 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9704 }
9705 else if (known_lt (initial_adjust + sve_callee_adjust,
9706 guard_size - guard_used_by_caller)
9707 && known_lt (final_adjust, guard_used_by_caller))
9708 {
9709 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9710 }
9711 }
9712
9713 /* If SIZE is not large enough to require probing, just adjust the stack and
9714 exit. */
9715 if (known_lt (poly_size, min_probe_threshold)
9716 || !flag_stack_clash_protection)
9717 {
9718 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9719 return;
9720 }
9721
9722 HOST_WIDE_INT size;
9723 /* Handle the SVE non-constant case first. */
9724 if (!poly_size.is_constant (&size))
9725 {
9726 if (dump_file)
9727 {
9728 fprintf (dump_file, "Stack clash SVE prologue: ");
9729 print_dec (poly_size, dump_file);
9730 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9731 }
9732
9733 /* First calculate the number of bytes we're actually spilling. */
9734 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9735 poly_size, temp1, temp2, false, true);
9736
9737 rtx_insn *insn = get_last_insn ();
9738
9739 if (frame_related_p)
9740 {
9741 /* This is done to provide unwinding information for the stack
9742 adjustments we're about to do. However, to prevent the optimizers
9743 from removing the R11 move and leaving the CFA note (which would be
9744 very wrong) we tie the old and new stack pointer together.
9745 The tie will expand to nothing but the optimizers will not touch
9746 the instruction. */
9747 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9748 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9749 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9750
9751 /* We want the CFA independent of the stack pointer for the
9752 duration of the loop. */
9753 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9754 RTX_FRAME_RELATED_P (insn) = 1;
9755 }
9756
9757 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9758 rtx guard_const = gen_int_mode (guard_size, Pmode);
9759
9760 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9761 stack_pointer_rtx, temp1,
9762 probe_const, guard_const));
9763
9764 /* Now reset the CFA register if needed. */
9765 if (frame_related_p)
9766 {
9767 add_reg_note (insn, REG_CFA_DEF_CFA,
9768 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9769 gen_int_mode (poly_size, Pmode)));
9770 RTX_FRAME_RELATED_P (insn) = 1;
9771 }
9772
9773 return;
9774 }
9775
9776 if (dump_file)
9777 fprintf (dump_file,
9778 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9779 " bytes, probing will be required.\n", size);
9780
9781 /* Round size to the nearest multiple of guard_size, and calculate the
9782 residual as the difference between the original size and the rounded
9783 size. */
9784 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9785 HOST_WIDE_INT residual = size - rounded_size;
9786
9787 /* We can handle a small number of allocations/probes inline. Otherwise
9788 punt to a loop. */
9789 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9790 {
9791 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9792 {
9793 aarch64_sub_sp (NULL, temp2, guard_size, true);
9794 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9795 guard_used_by_caller));
9796 emit_insn (gen_blockage ());
9797 }
9798 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9799 }
9800 else
9801 {
9802 /* Compute the ending address. */
9803 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9804 temp1, NULL, false, true);
9805 rtx_insn *insn = get_last_insn ();
9806
9807 /* For the initial allocation, we don't have a frame pointer
9808 set up, so we always need CFI notes. If we're doing the
9809 final allocation, then we may have a frame pointer, in which
9810 case it is the CFA, otherwise we need CFI notes.
9811
9812 We can determine which allocation we are doing by looking at
9813 the value of FRAME_RELATED_P since the final allocations are not
9814 frame related. */
9815 if (frame_related_p)
9816 {
9817 /* We want the CFA independent of the stack pointer for the
9818 duration of the loop. */
9819 add_reg_note (insn, REG_CFA_DEF_CFA,
9820 plus_constant (Pmode, temp1, rounded_size));
9821 RTX_FRAME_RELATED_P (insn) = 1;
9822 }
9823
9824 /* This allocates and probes the stack. Note that this re-uses some of
9825 the existing Ada stack protection code. However we are guaranteed not
9826 to enter the non-loop or residual branches of that code.
9827
9828 The non-loop part won't be entered because if our allocation amount
9829 doesn't require a loop, the case above would handle it.
9830
9831 The residual amount won't be entered because TEMP1 is a multiple of
9832 the allocation size. The residual will always be 0. As such, the only
9833 part we are actually using from that code is the loop setup. The
9834 actual probing is done in aarch64_output_probe_stack_range. */
9835 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9836 stack_pointer_rtx, temp1));
9837
9838 /* Now reset the CFA register if needed. */
9839 if (frame_related_p)
9840 {
9841 add_reg_note (insn, REG_CFA_DEF_CFA,
9842 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9843 RTX_FRAME_RELATED_P (insn) = 1;
9844 }
9845
9846 emit_insn (gen_blockage ());
9847 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9848 }
9849
9850 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9851 be probed. This maintains the requirement that each page is probed at
9852 least once. For initial probing we probe only if the allocation is
9853 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9854 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9855 GUARD_SIZE. This works because any allocation that is large enough to
9856 trigger a probe here gets at least one, while any allocation that is not
9857 large enough for this code to emit anything will already have had its page
9858 probed by the saving of FP/LR, either by this function or by a callee. If
9859 we don't have any callees then we won't have more stack adjustments and so
9860 are still safe. */
9861 if (residual)
9862 {
9863 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9864 /* If we're doing final adjustments, and we've done any full page
9865 allocations then any residual needs to be probed. */
9866 if (final_adjustment_p && rounded_size != 0)
9867 min_probe_threshold = 0;
9868 /* If doing a small final adjustment, we always probe at offset 0.
9869 This is done to avoid issues when LR is not at position 0 or when
9870 the final adjustment is smaller than the probing offset. */
9871 else if (final_adjustment_p && rounded_size == 0)
9872 residual_probe_offset = 0;
9873
9874 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9875 if (residual >= min_probe_threshold)
9876 {
9877 if (dump_file)
9878 fprintf (dump_file,
9879 "Stack clash AArch64 prologue residuals: "
9880 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9881 "\n", residual);
9882
9883 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9884 residual_probe_offset));
9885 emit_insn (gen_blockage ());
9886 }
9887 }
9888 }
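/* To make the constant-sized path of the function above concrete (sizes
   are hypothetical, assuming a 64KB guard, a 1KB caller buffer and the
   unrolled path being taken): allocating 128KB + 512 bytes as the initial
   adjustment emits two unrolled iterations of roughly

       sub	sp, sp, #65536
       str	xzr, [sp, #1024]

   followed by "sub sp, sp, #512" for the residual, which needs no probe
   of its own because it is below the 63KB threshold.  */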
9889
9890 /* Return 1 if the register is used by the epilogue. We need to say the
9891 return register is used, but only after epilogue generation is complete.
9892 Note that in the case of sibcalls, the values "used by the epilogue" are
9893 considered live at the start of the called function.
9894
9895 For SIMD functions we need to return 1 for FP registers that are saved and
9896 restored by a function but are not zero in call_used_regs. If we do not do
9897 this, optimizations may remove the restore of the register. */
9898
9899 int
9900 aarch64_epilogue_uses (int regno)
9901 {
9902 if (epilogue_completed)
9903 {
9904 if (regno == LR_REGNUM)
9905 return 1;
9906 }
9907 return 0;
9908 }
9909
9910 /* AArch64 stack frames generated by this compiler look like:
9911
9912 +-------------------------------+
9913 | |
9914 | incoming stack arguments |
9915 | |
9916 +-------------------------------+
9917 | | <-- incoming stack pointer (aligned)
9918 | callee-allocated save area |
9919 | for register varargs |
9920 | |
9921 +-------------------------------+
9922 | local variables | <-- frame_pointer_rtx
9923 | |
9924 +-------------------------------+
9925 | padding | \
9926 +-------------------------------+ |
9927 | callee-saved registers | | frame.saved_regs_size
9928 +-------------------------------+ |
9929 | LR' | |
9930 +-------------------------------+ |
9931 | FP' | |
9932 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9933 | SVE vector registers | | \
9934 +-------------------------------+ | | below_hard_fp_saved_regs_size
9935 | SVE predicate registers | / /
9936 +-------------------------------+
9937 | dynamic allocation |
9938 +-------------------------------+
9939 | padding |
9940 +-------------------------------+
9941 | outgoing stack arguments | <-- arg_pointer
9942 | |
9943 +-------------------------------+
9944 | | <-- stack_pointer_rtx (aligned)
9945
9946 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9947 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9948 unchanged.
9949
9950 By default for stack-clash we assume the guard is at least 64KB, but this
9951 value is configurable to either 4KB or 64KB. We also force the guard size to
9952 be the same as the probing interval and both values are kept in sync.
9953
9954 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9955 on the guard size) of stack space without probing.
9956
9957 When probing is needed, we emit a probe at the start of the prologue
9958 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9959
9960 We have to track how much space has been allocated and the only stores
9961 to the stack we track as implicit probes are the FP/LR stores.
9962
9963 For outgoing arguments we probe if the size is larger than 1KB, such that
9964 the ABI specified buffer is maintained for the next callee.
9965
9966 The following registers are reserved during frame layout and should not be
9967 used for any other purpose:
9968
9969 - r11: Used by stack clash protection when SVE is enabled, and also
9970 as an anchor register when saving and restoring registers
9971 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9972 - r14 and r15: Used for speculation tracking.
9973 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9974 - r30(LR), r29(FP): Used by standard frame layout.
9975
9976 These registers must be avoided in frame layout related code unless the
9977 explicit intention is to interact with one of the features listed above. */
9978
9979 /* Generate the prologue instructions for entry into a function.
9980 Establish the stack frame by decreasing the stack pointer with a
9981 properly calculated size and, if necessary, create a frame record
9982 filled with the values of LR and previous frame pointer. The
9983 current FP is also set up if it is in use. */
9984
9985 void
9986 aarch64_expand_prologue (void)
9987 {
9988 poly_int64 frame_size = cfun->machine->frame.frame_size;
9989 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9990 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9991 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9992 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9993 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9994 poly_int64 below_hard_fp_saved_regs_size
9995 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9996 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9997 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9998 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9999 rtx_insn *insn;
10000
10001 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
10002 {
10003 /* Fold the SVE allocation into the initial allocation.
10004 We don't do this during frame layout to avoid pessimizing
10005 the epilogue code. */
10006 initial_adjust += sve_callee_adjust;
10007 sve_callee_adjust = 0;
10008 }
10009
10010 /* Sign return address for functions. */
10011 if (aarch64_return_address_signing_enabled ())
10012 {
10013 switch (aarch64_ra_sign_key)
10014 {
10015 case AARCH64_KEY_A:
10016 insn = emit_insn (gen_paciasp ());
10017 break;
10018 case AARCH64_KEY_B:
10019 insn = emit_insn (gen_pacibsp ());
10020 break;
10021 default:
10022 gcc_unreachable ();
10023 }
10024 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10025 RTX_FRAME_RELATED_P (insn) = 1;
10026 }
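/* For reference (a behaviour sketch, not additional code): with
   -mbranch-protection=pac-ret the block above emits PACIASP here, and the
   matching AUTIASP (or a combined authenticating return) is emitted in the
   epilogue; both hints execute as NOPs on cores without pointer
   authentication.  */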
10027
10028 /* Push return address to shadow call stack. */
10029 if (cfun->machine->frame.is_scs_enabled)
10030 emit_insn (gen_scs_push ());
10031
10032 if (flag_stack_usage_info)
10033 current_function_static_stack_size = constant_lower_bound (frame_size);
10034
10035 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10036 {
10037 if (crtl->is_leaf && !cfun->calls_alloca)
10038 {
10039 if (maybe_gt (frame_size, PROBE_INTERVAL)
10040 && maybe_gt (frame_size, get_stack_check_protect ()))
10041 aarch64_emit_probe_stack_range (get_stack_check_protect (),
10042 (frame_size
10043 - get_stack_check_protect ()));
10044 }
10045 else if (maybe_gt (frame_size, 0))
10046 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
10047 }
10048
10049 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10050 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10051
10052 /* In theory we should never have both an initial adjustment
10053 and a callee save adjustment. Verify that is the case since the
10054 code below does not handle it for -fstack-clash-protection. */
10055 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
10056
10057 /* Will only probe if the initial adjustment is larger than the guard
10058 less the amount of the guard reserved for use by the caller's
10059 outgoing args. */
10060 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
10061 true, false);
10062
10063 if (callee_adjust != 0)
10064 aarch64_push_regs (reg1, reg2, callee_adjust);
10065
10066 /* The offset of the frame chain record (if any) from the current SP. */
10067 poly_int64 chain_offset = (initial_adjust + callee_adjust
10068 - cfun->machine->frame.hard_fp_offset);
10069 gcc_assert (known_ge (chain_offset, 0));
10070
10071 /* The offset of the bottom of the save area from the current SP. */
10072 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
10073
10074 if (emit_frame_chain)
10075 {
10076 if (callee_adjust == 0)
10077 {
10078 reg1 = R29_REGNUM;
10079 reg2 = R30_REGNUM;
10080 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
10081 false, false);
10082 }
10083 else
10084 gcc_assert (known_eq (chain_offset, 0));
10085 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
10086 stack_pointer_rtx, chain_offset,
10087 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
10088 if (frame_pointer_needed && !frame_size.is_constant ())
10089 {
10090 /* Variable-sized frames need to describe the save slot
10091 address using DW_CFA_expression rather than DW_CFA_offset.
10092 This means that, without taking further action, the
10093 locations of the registers that we've already saved would
10094 remain based on the stack pointer even after we redefine
10095 the CFA based on the frame pointer. We therefore need new
10096 DW_CFA_expressions to re-express the save slots with addresses
10097 based on the frame pointer. */
10098 rtx_insn *insn = get_last_insn ();
10099 gcc_assert (RTX_FRAME_RELATED_P (insn));
10100
10101 /* Add an explicit CFA definition if this was previously
10102 implicit. */
10103 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
10104 {
10105 rtx src = plus_constant (Pmode, stack_pointer_rtx,
10106 callee_offset);
10107 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10108 gen_rtx_SET (hard_frame_pointer_rtx, src));
10109 }
10110
10111 /* Change the save slot expressions for the registers that
10112 we've already saved. */
10113 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
10114 hard_frame_pointer_rtx, UNITS_PER_WORD);
10115 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
10116 hard_frame_pointer_rtx, 0);
10117 }
10118 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
10119 }
10120
10121 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
10122 callee_adjust != 0 || emit_frame_chain,
10123 emit_frame_chain);
10124 if (maybe_ne (sve_callee_adjust, 0))
10125 {
10126 gcc_assert (!flag_stack_clash_protection
10127 || known_eq (initial_adjust, 0));
10128 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
10129 sve_callee_adjust,
10130 !frame_pointer_needed, false);
10131 saved_regs_offset += sve_callee_adjust;
10132 }
10133 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
10134 false, emit_frame_chain);
10135 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
10136 callee_adjust != 0 || emit_frame_chain,
10137 emit_frame_chain);
10138
10139 /* We may need to probe the final adjustment if it is larger than the guard
10140 that is assumed by the callee. */
10141 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10142 !frame_pointer_needed, true);
10143 }
10144
10145 /* Return TRUE if we can use a simple_return insn.
10146
10147 This function checks whether the callee saved stack is empty, which
10148 means no restore actions are needed. The pro_and_epilogue pass uses
10149 this to check whether the shrink-wrapping optimization is feasible. */
10150
10151 bool
10152 aarch64_use_return_insn_p (void)
10153 {
10154 if (!reload_completed)
10155 return false;
10156
10157 if (crtl->profile)
10158 return false;
10159
10160 return known_eq (cfun->machine->frame.frame_size, 0);
10161 }
10162
10163 /* Generate the epilogue instructions for returning from a function.
10164 This is almost exactly the reverse of the prologue sequence, except
10165 that we need to insert barriers to avoid scheduling loads that read
10166 from a deallocated stack, and we optimize the unwind records by
10167 emitting them all together if possible. */
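/* As a rough, illustrative sketch only (the real sequence depends on the
   frame layout chosen by aarch64_layout_frame and the options in effect),
   a small constant-sized frame with a frame chain and no SVE state is
   torn down along these lines:

       add   sp, sp, #final_adjust            // free the outgoing-args area
       ldp   x29, x30, [sp], #callee_adjust   // restore FP/LR and pop
       add   sp, sp, #initial_adjust          // free the rest of the frame
       ret  */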
10168 void
10169 aarch64_expand_epilogue (bool for_sibcall)
10170 {
10171 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
10172 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
10173 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
10174 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
10175 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
10176 poly_int64 below_hard_fp_saved_regs_size
10177 = cfun->machine->frame.below_hard_fp_saved_regs_size;
10178 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
10179 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
10180 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
10181 ? R29_REGNUM : R30_REGNUM);
10182 rtx cfi_ops = NULL;
10183 rtx_insn *insn;
10184 /* A stack clash protection prologue may not have left EP0_REGNUM or
10185 EP1_REGNUM in a usable state. The same is true for allocations
10186 with an SVE component, since we then need both temporary registers
10187 for each allocation. For stack clash we are in a usable state if
10188 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10189 HOST_WIDE_INT guard_size
10190 = 1 << param_stack_clash_protection_guard_size;
10191 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10192
10193 /* We can re-use the registers when:
10194
10195 (a) the deallocation amount is the same as the corresponding
10196 allocation amount (which is false if we combine the initial
10197 and SVE callee save allocations in the prologue); and
10198
10199 (b) the allocation amount doesn't need a probe (which is false
10200 if the amount is guard_size - guard_used_by_caller or greater).
10201
10202 In such situations the register should remain live with the correct
10203 value. */
10204 bool can_inherit_p = (initial_adjust.is_constant ()
10205 && final_adjust.is_constant ()
10206 && (!flag_stack_clash_protection
10207 || (known_lt (initial_adjust,
10208 guard_size - guard_used_by_caller)
10209 && known_eq (sve_callee_adjust, 0))));
10210
10211 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
10212 bool need_barrier_p
10213 = maybe_ne (get_frame_size ()
10214 + cfun->machine->frame.saved_varargs_size, 0);
10215
10216 /* Emit a barrier to prevent loads from a deallocated stack. */
10217 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10218 || cfun->calls_alloca
10219 || crtl->calls_eh_return)
10220 {
10221 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10222 need_barrier_p = false;
10223 }
10224
10225 /* Restore the stack pointer from the frame pointer if it may not
10226 be the same as the stack pointer. */
10227 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10228 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10229 if (frame_pointer_needed
10230 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10231 /* If writeback is used when restoring callee-saves, the CFA
10232 is restored on the instruction doing the writeback. */
10233 aarch64_add_offset (Pmode, stack_pointer_rtx,
10234 hard_frame_pointer_rtx,
10235 -callee_offset - below_hard_fp_saved_regs_size,
10236 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
10237 else
10238 /* The case where we need to re-use the register here is very rare, so
10239 avoid the complicated condition and just always emit a move if the
10240 immediate doesn't fit. */
10241 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
10242
10243 /* Restore the vector registers before the predicate registers,
10244 so that we can use P4 as a temporary for big-endian SVE frames. */
10245 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
10246 callee_adjust != 0, &cfi_ops);
10247 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
10248 false, &cfi_ops);
10249 if (maybe_ne (sve_callee_adjust, 0))
10250 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
10251
10252 /* When the shadow call stack is enabled, the scs_pop in the epilogue
10253 restores x30, so there is no need to restore x30 again in the
10254 traditional way. */
10255 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
10256 R0_REGNUM, last_gpr,
10257 callee_adjust != 0, &cfi_ops);
10258
10259 if (need_barrier_p)
10260 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10261
10262 if (callee_adjust != 0)
10263 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10264
10265 /* If we have no register restore information, the CFA must have been
10266 defined in terms of the stack pointer since the end of the prologue. */
10267 gcc_assert (cfi_ops || !frame_pointer_needed);
10268
10269 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10270 {
10271 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10272 insn = get_last_insn ();
10273 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10274 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10275 RTX_FRAME_RELATED_P (insn) = 1;
10276 cfi_ops = NULL;
10277 }
10278
10279 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10280 restrict the emit_move optimization to leaf functions. */
10281 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10282 (!can_inherit_p || !crtl->is_leaf
10283 || df_regs_ever_live_p (EP0_REGNUM)));
10284
10285 if (cfi_ops)
10286 {
10287 /* Emit delayed restores and reset the CFA to be SP. */
10288 insn = get_last_insn ();
10289 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10290 REG_NOTES (insn) = cfi_ops;
10291 RTX_FRAME_RELATED_P (insn) = 1;
10292 }
10293
10294 /* Pop return address from shadow call stack. */
10295 if (cfun->machine->frame.is_scs_enabled)
10296 {
10297 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10298 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10299
10300 insn = emit_insn (gen_scs_pop ());
10301 add_reg_note (insn, REG_CFA_RESTORE, reg);
10302 RTX_FRAME_RELATED_P (insn) = 1;
10303 }
10304
10305 /* We prefer to emit the combined return/authenticate instruction RETAA,
10306 however there are two cases in which we must instead emit an explicit
10307 authentication instruction.
10308
10309 1) Sibcalls don't return in a normal way, so if we're about to call one
10310 we must authenticate.
10311
10312 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10313 generating code for !TARGET_ARMV8_3 we can't use it and must
10314 explicitly authenticate.
10315 */
10316 if (aarch64_return_address_signing_enabled ()
10317 && (for_sibcall || !TARGET_ARMV8_3))
10318 {
10319 switch (aarch64_ra_sign_key)
10320 {
10321 case AARCH64_KEY_A:
10322 insn = emit_insn (gen_autiasp ());
10323 break;
10324 case AARCH64_KEY_B:
10325 insn = emit_insn (gen_autibsp ());
10326 break;
10327 default:
10328 gcc_unreachable ();
10329 }
10330 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10331 RTX_FRAME_RELATED_P (insn) = 1;
10332 }
10333
10334 /* Stack adjustment for exception handler. */
10335 if (crtl->calls_eh_return && !for_sibcall)
10336 {
10337 /* We need to unwind the stack by the offset computed by
10338 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10339 to be SP; letting the CFA move during this adjustment
10340 is just as correct as retaining the CFA from the body
10341 of the function. Therefore, do nothing special. */
10342 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
10343 }
10344
10345 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10346 if (!for_sibcall)
10347 emit_jump_insn (ret_rtx);
10348 }
10349
10350 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10351 normally or return to a previous frame after unwinding.
10352
10353 An EH return uses a single shared return sequence. The epilogue is
10354 exactly like a normal epilogue except that it has an extra input
10355 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10356 that must be applied after the frame has been destroyed. An extra label
10357 is inserted before the epilogue which initializes this register to zero,
10358 and this is the entry point for a normal return.
10359
10360 An actual EH return updates the return address, initializes the stack
10361 adjustment and jumps directly into the epilogue (bypassing the zeroing
10362 of the adjustment). Since the return address is typically saved on the
10363 stack when a function makes a call, the saved LR must be updated outside
10364 the epilogue.
10365
10366 This poses problems as the store is generated well before the epilogue,
10367 so the offset of LR is not known yet. Also optimizations will remove the
10368 store as it appears dead, even after the epilogue is generated (as the
10369 base or offset for loading LR is different in many cases).
10370
10371 To avoid these problems this implementation forces the frame pointer
10372 in eh_return functions so that the location of LR is fixed and known early.
10373 It also marks the store volatile, so no optimization is permitted to
10374 remove the store. */
10375 rtx
10376 aarch64_eh_return_handler_rtx (void)
10377 {
10378 rtx tmp = gen_frame_mem (Pmode,
10379 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
10380
10381 /* Mark the store volatile, so no optimization is permitted to remove it. */
10382 MEM_VOLATILE_P (tmp) = true;
10383 return tmp;
10384 }
10385
10386 /* Output code to add DELTA to the first argument, and then jump
10387 to FUNCTION. Used for C++ multiple inheritance. */
10388 static void
10389 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10390 HOST_WIDE_INT delta,
10391 HOST_WIDE_INT vcall_offset,
10392 tree function)
10393 {
10394 /* The this pointer is always in x0. Note that this differs from
10395 Arm where the this pointer may be bumped to r1 if r0 is required
10396 to return a pointer to an aggregate. On AArch64 a result value
10397 pointer will be in x8. */
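/* As an illustration only (assuming VCALL_OFFSET is zero, DELTA fits the
   immediate range and ignoring any BTI landing pad), the emitted thunk
   amounts to:

       add   x0, x0, #delta
       b     <function>

   Larger deltas and non-zero vcall offsets use EP0/EP1 as scratch
   registers, as the code below shows. */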
10398 int this_regno = R0_REGNUM;
10399 rtx this_rtx, temp0, temp1, addr, funexp;
10400 rtx_insn *insn;
10401 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10402
10403 if (aarch64_bti_enabled ())
10404 emit_insn (gen_bti_c ());
10405
10406 reload_completed = 1;
10407 emit_note (NOTE_INSN_PROLOGUE_END);
10408
10409 this_rtx = gen_rtx_REG (Pmode, this_regno);
10410 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10411 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10412
10413 if (vcall_offset == 0)
10414 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10415 else
10416 {
10417 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10418
10419 addr = this_rtx;
10420 if (delta != 0)
10421 {
10422 if (delta >= -256 && delta < 256)
10423 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10424 plus_constant (Pmode, this_rtx, delta));
10425 else
10426 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10427 temp1, temp0, false);
10428 }
10429
10430 if (Pmode == ptr_mode)
10431 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10432 else
10433 aarch64_emit_move (temp0,
10434 gen_rtx_ZERO_EXTEND (Pmode,
10435 gen_rtx_MEM (ptr_mode, addr)));
10436
10437 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10438 addr = plus_constant (Pmode, temp0, vcall_offset);
10439 else
10440 {
10441 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10442 Pmode);
10443 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10444 }
10445
10446 if (Pmode == ptr_mode)
10447 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
10448 else
10449 aarch64_emit_move (temp1,
10450 gen_rtx_SIGN_EXTEND (Pmode,
10451 gen_rtx_MEM (ptr_mode, addr)));
10452
10453 emit_insn (gen_add2_insn (this_rtx, temp1));
10454 }
10455
10456 /* Generate a tail call to the target function. */
10457 if (!TREE_USED (function))
10458 {
10459 assemble_external (function);
10460 TREE_USED (function) = 1;
10461 }
10462 funexp = XEXP (DECL_RTL (function), 0);
10463 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10464 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10465 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10466 SIBLING_CALL_P (insn) = 1;
10467
10468 insn = get_insns ();
10469 shorten_branches (insn);
10470
10471 assemble_start_function (thunk, fnname);
10472 final_start_function (insn, file, 1);
10473 final (insn, file, 1);
10474 final_end_function ();
10475 assemble_end_function (thunk, fnname);
10476
10477 /* Stop pretending to be a post-reload pass. */
10478 reload_completed = 0;
10479 }
10480
10481 static bool
10482 aarch64_tls_referenced_p (rtx x)
10483 {
10484 if (!TARGET_HAVE_TLS)
10485 return false;
10486 subrtx_iterator::array_type array;
10487 FOR_EACH_SUBRTX (iter, array, x, ALL)
10488 {
10489 const_rtx x = *iter;
10490 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10491 return true;
10492 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10493 TLS offsets, not real symbol references. */
10494 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10495 iter.skip_subrtxes ();
10496 }
10497 return false;
10498 }
10499
10500
10501 static bool
10502 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10503 {
10504 if (GET_CODE (x) == HIGH)
10505 return true;
10506
10507 /* There's no way to calculate VL-based values using relocations. */
10508 subrtx_iterator::array_type array;
10509 FOR_EACH_SUBRTX (iter, array, x, ALL)
10510 if (GET_CODE (*iter) == CONST_POLY_INT)
10511 return true;
10512
10513 poly_int64 offset;
10514 rtx base = strip_offset_and_salt (x, &offset);
10515 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10516 {
10517 /* We checked for POLY_INT_CST offsets above. */
10518 if (aarch64_classify_symbol (base, offset.to_constant ())
10519 != SYMBOL_FORCE_TO_MEM)
10520 return true;
10521 else
10522 /* Avoid generating a 64-bit relocation in ILP32; leave
10523 to aarch64_expand_mov_immediate to handle it properly. */
10524 return mode != ptr_mode;
10525 }
10526
10527 return aarch64_tls_referenced_p (x);
10528 }
10529
10530 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10531 The expansion for a table switch is quite expensive due to the number
10532 of instructions, the table lookup and the hard-to-predict indirect jump.
10533 When optimizing for speed at -O3 or above, use the per-core tuning if
10534 set; otherwise use tables for >= 11 cases as a tradeoff between size and
10535 performance. When optimizing for size, use 8 for the smallest code size. */
10536
10537 static unsigned int
10538 aarch64_case_values_threshold (void)
10539 {
10540 /* Use the specified limit for the number of cases before using jump
10541 tables at higher optimization levels. */
10542 if (optimize > 2
10543 && aarch64_tune_params.max_case_values != 0)
10544 return aarch64_tune_params.max_case_values;
10545 else
10546 return optimize_size ? 8 : 11;
10547 }
10548
10549 /* Return true if register REGNO is a valid index register.
10550 STRICT_P is true if REG_OK_STRICT is in effect. */
10551
10552 bool
10553 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10554 {
10555 if (!HARD_REGISTER_NUM_P (regno))
10556 {
10557 if (!strict_p)
10558 return true;
10559
10560 if (!reg_renumber)
10561 return false;
10562
10563 regno = reg_renumber[regno];
10564 }
10565 return GP_REGNUM_P (regno);
10566 }
10567
10568 /* Return true if register REGNO is a valid base register.
10569 STRICT_P is true if REG_OK_STRICT is in effect. */
10570
10571 bool
10572 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10573 {
10574 if (!HARD_REGISTER_NUM_P (regno))
10575 {
10576 if (!strict_p)
10577 return true;
10578
10579 if (!reg_renumber)
10580 return false;
10581
10582 regno = reg_renumber[regno];
10583 }
10584
10585 /* The fake registers will be eliminated to either the stack or
10586 hard frame pointer, both of which are usually valid base registers.
10587 Reload deals with the cases where the eliminated form isn't valid. */
10588 return (GP_REGNUM_P (regno)
10589 || regno == SP_REGNUM
10590 || regno == FRAME_POINTER_REGNUM
10591 || regno == ARG_POINTER_REGNUM);
10592 }
10593
10594 /* Return true if X is a valid base register.
10595 STRICT_P is true if REG_OK_STRICT is in effect. */
10596
10597 static bool
10598 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10599 {
10600 if (!strict_p
10601 && SUBREG_P (x)
10602 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10603 x = SUBREG_REG (x);
10604
10605 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10606 }
10607
10608 /* Return true if address offset is a valid index. If it is, fill in INFO
10609 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
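/* For illustration, in the usual A64 syntax the forms recognized below
   correspond to index operands such as:

     (reg:DI x1)                                  -> [x0, x1]
     (ashift:DI (reg:DI x1) (const_int 3))        -> [x0, x1, lsl #3]
     (mult:DI (sign_extend:DI (reg:SI w1))
              (const_int 4))                      -> [x0, w1, sxtw #2]

   where x0 stands for whatever base register accompanies the index. */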
10610
10611 static bool
10612 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10613 machine_mode mode, bool strict_p)
10614 {
10615 enum aarch64_address_type type;
10616 rtx index;
10617 int shift;
10618
10619 /* (reg:P) */
10620 if ((REG_P (x) || SUBREG_P (x))
10621 && GET_MODE (x) == Pmode)
10622 {
10623 type = ADDRESS_REG_REG;
10624 index = x;
10625 shift = 0;
10626 }
10627 /* (sign_extend:DI (reg:SI)) */
10628 else if ((GET_CODE (x) == SIGN_EXTEND
10629 || GET_CODE (x) == ZERO_EXTEND)
10630 && GET_MODE (x) == DImode
10631 && GET_MODE (XEXP (x, 0)) == SImode)
10632 {
10633 type = (GET_CODE (x) == SIGN_EXTEND)
10634 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10635 index = XEXP (x, 0);
10636 shift = 0;
10637 }
10638 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10639 else if (GET_CODE (x) == MULT
10640 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10641 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10642 && GET_MODE (XEXP (x, 0)) == DImode
10643 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10644 && CONST_INT_P (XEXP (x, 1)))
10645 {
10646 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10647 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10648 index = XEXP (XEXP (x, 0), 0);
10649 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10650 }
10651 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10652 else if (GET_CODE (x) == ASHIFT
10653 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10654 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10655 && GET_MODE (XEXP (x, 0)) == DImode
10656 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10657 && CONST_INT_P (XEXP (x, 1)))
10658 {
10659 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10660 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10661 index = XEXP (XEXP (x, 0), 0);
10662 shift = INTVAL (XEXP (x, 1));
10663 }
10664 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10665 (const_int 0xffffffff<<shift)) */
10666 else if (GET_CODE (x) == AND
10667 && GET_MODE (x) == DImode
10668 && GET_CODE (XEXP (x, 0)) == MULT
10669 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10670 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10671 && CONST_INT_P (XEXP (x, 1)))
10672 {
10673 type = ADDRESS_REG_UXTW;
10674 index = XEXP (XEXP (x, 0), 0);
10675 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10676 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10677 shift = -1;
10678 }
10679 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10680 (const_int 0xffffffff<<shift)) */
10681 else if (GET_CODE (x) == AND
10682 && GET_MODE (x) == DImode
10683 && GET_CODE (XEXP (x, 0)) == ASHIFT
10684 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10685 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10686 && CONST_INT_P (XEXP (x, 1)))
10687 {
10688 type = ADDRESS_REG_UXTW;
10689 index = XEXP (XEXP (x, 0), 0);
10690 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10691 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10692 shift = -1;
10693 }
10694 /* (mult:P (reg:P) (const_int scale)) */
10695 else if (GET_CODE (x) == MULT
10696 && GET_MODE (x) == Pmode
10697 && GET_MODE (XEXP (x, 0)) == Pmode
10698 && CONST_INT_P (XEXP (x, 1)))
10699 {
10700 type = ADDRESS_REG_REG;
10701 index = XEXP (x, 0);
10702 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10703 }
10704 /* (ashift:P (reg:P) (const_int shift)) */
10705 else if (GET_CODE (x) == ASHIFT
10706 && GET_MODE (x) == Pmode
10707 && GET_MODE (XEXP (x, 0)) == Pmode
10708 && CONST_INT_P (XEXP (x, 1)))
10709 {
10710 type = ADDRESS_REG_REG;
10711 index = XEXP (x, 0);
10712 shift = INTVAL (XEXP (x, 1));
10713 }
10714 else
10715 return false;
10716
10717 if (!strict_p
10718 && SUBREG_P (index)
10719 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10720 index = SUBREG_REG (index);
10721
10722 if (aarch64_sve_data_mode_p (mode))
10723 {
10724 if (type != ADDRESS_REG_REG
10725 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10726 return false;
10727 }
10728 else
10729 {
10730 if (shift != 0
10731 && !(IN_RANGE (shift, 1, 3)
10732 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10733 return false;
10734 }
10735
10736 if (REG_P (index)
10737 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10738 {
10739 info->type = type;
10740 info->offset = index;
10741 info->shift = shift;
10742 return true;
10743 }
10744
10745 return false;
10746 }
10747
10748 /* Return true if MODE is one of the modes for which we
10749 support LDP/STP operations. */
10750
10751 static bool
10752 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10753 {
10754 return mode == SImode || mode == DImode
10755 || mode == SFmode || mode == DFmode
10756 || mode == SDmode || mode == DDmode
10757 || (aarch64_vector_mode_supported_p (mode)
10758 && (known_eq (GET_MODE_SIZE (mode), 8)
10759 || (known_eq (GET_MODE_SIZE (mode), 16)
10760 && (aarch64_tune_params.extra_tuning_flags
10761 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10762 }
10763
10764 /* Return true if REGNO is a virtual pointer register, or an eliminable
10765 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10766 include stack_pointer or hard_frame_pointer. */
10767 static bool
10768 virt_or_elim_regno_p (unsigned regno)
10769 {
10770 return ((regno >= FIRST_VIRTUAL_REGISTER
10771 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10772 || regno == FRAME_POINTER_REGNUM
10773 || regno == ARG_POINTER_REGNUM);
10774 }
10775
10776 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10777 If it is, fill in INFO appropriately. STRICT_P is true if
10778 REG_OK_STRICT is in effect. */
10779
10780 bool
10781 aarch64_classify_address (struct aarch64_address_info *info,
10782 rtx x, machine_mode mode, bool strict_p,
10783 aarch64_addr_query_type type)
10784 {
10785 enum rtx_code code = GET_CODE (x);
10786 rtx op0, op1;
10787 poly_int64 offset;
10788
10789 HOST_WIDE_INT const_size;
10790
10791 /* Whether a vector mode is partial doesn't affect address legitimacy.
10792 Partial vectors like VNx8QImode allow the same indexed addressing
10793 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10794 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10795 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10796 vec_flags &= ~VEC_PARTIAL;
10797
10798 /* On BE, we use load/store pair for all large int mode load/stores.
10799 TI/TF/TDmode may also use a load/store pair. */
10800 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10801 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10802 || type == ADDR_QUERY_LDP_STP_N
10803 || mode == TImode
10804 || mode == TFmode
10805 || mode == TDmode
10806 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10807 && advsimd_struct_p));
10808 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
10809 corresponds to the full size of the memory being loaded/stored, while
10810 the mode used to validate the addressing mode is half of that size. */
10811 if (type == ADDR_QUERY_LDP_STP_N)
10812 {
10813 if (known_eq (GET_MODE_SIZE (mode), 16))
10814 mode = DFmode;
10815 else if (known_eq (GET_MODE_SIZE (mode), 8))
10816 mode = SFmode;
10817 else
10818 return false;
10819 }
10820
10821 bool allow_reg_index_p = (!load_store_pair_p
10822 && ((vec_flags == 0
10823 && known_lt (GET_MODE_SIZE (mode), 16))
10824 || vec_flags == VEC_ADVSIMD
10825 || vec_flags & VEC_SVE_DATA));
10826
10827 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10828 The latter is not valid for SVE predicates, and that's rejected through
10829 allow_reg_index_p above. */
10830 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10831 && (code != REG && code != PLUS))
10832 return false;
10833
10834 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10835 REG addressing. */
10836 if (advsimd_struct_p
10837 && TARGET_SIMD
10838 && !BYTES_BIG_ENDIAN
10839 && (code != POST_INC && code != REG))
10840 return false;
10841
10842 gcc_checking_assert (GET_MODE (x) == VOIDmode
10843 || SCALAR_INT_MODE_P (GET_MODE (x)));
10844
10845 switch (code)
10846 {
10847 case REG:
10848 case SUBREG:
10849 info->type = ADDRESS_REG_IMM;
10850 info->base = x;
10851 info->offset = const0_rtx;
10852 info->const_offset = 0;
10853 return aarch64_base_register_rtx_p (x, strict_p);
10854
10855 case PLUS:
10856 op0 = XEXP (x, 0);
10857 op1 = XEXP (x, 1);
10858
10859 if (! strict_p
10860 && REG_P (op0)
10861 && virt_or_elim_regno_p (REGNO (op0))
10862 && poly_int_rtx_p (op1, &offset))
10863 {
10864 info->type = ADDRESS_REG_IMM;
10865 info->base = op0;
10866 info->offset = op1;
10867 info->const_offset = offset;
10868
10869 return true;
10870 }
10871
10872 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10873 && aarch64_base_register_rtx_p (op0, strict_p)
10874 && poly_int_rtx_p (op1, &offset))
10875 {
10876 info->type = ADDRESS_REG_IMM;
10877 info->base = op0;
10878 info->offset = op1;
10879 info->const_offset = offset;
10880
10881 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10882 registers and individual Q registers. The available
10883 address modes are:
10884 X,X: 7-bit signed scaled offset
10885 Q: 9-bit signed offset
10886 We conservatively require an offset representable in either mode.
10887 When performing the check for pairs of X registers i.e. LDP/STP
10888 pass down DImode since that is the natural size of the LDP/STP
10889 instruction memory accesses. */
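/* As a worked example (offsets in bytes): a TImode offset of 256 is
   accepted, since 256 is 32 * 8 (within the signed scaled 7-bit LDP
   range) and is also a valid 12-bit unsigned scaled Q-register offset,
   whereas 512 is rejected because 512 / 8 = 64 exceeds the signed
   7-bit range. */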
10890 if (mode == TImode || mode == TFmode || mode == TDmode)
10891 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10892 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10893 || offset_12bit_unsigned_scaled_p (mode, offset)));
10894
10895 if (mode == V8DImode)
10896 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10897 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10898
10899 /* A 7-bit offset check because OImode will emit an ldp/stp
10900 instruction (only !TARGET_SIMD or big-endian will get here).
10901 For ldp/stp instructions, the offset is scaled for the size of a
10902 single element of the pair. */
10903 if (aarch64_advsimd_partial_struct_mode_p (mode)
10904 && known_eq (GET_MODE_SIZE (mode), 16))
10905 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10906 if (aarch64_advsimd_full_struct_mode_p (mode)
10907 && known_eq (GET_MODE_SIZE (mode), 32))
10908 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10909
10910 /* Three 9/12-bit offset checks because CImode will emit three
10911 ldr/str instructions (only !TARGET_SIMD or big endian will
10912 get here). */
10913 if (aarch64_advsimd_partial_struct_mode_p (mode)
10914 && known_eq (GET_MODE_SIZE (mode), 24))
10915 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10916 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10917 offset + 16)
10918 || offset_12bit_unsigned_scaled_p (DImode,
10919 offset + 16)));
10920 if (aarch64_advsimd_full_struct_mode_p (mode)
10921 && known_eq (GET_MODE_SIZE (mode), 48))
10922 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10923 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10924 offset + 32)
10925 || offset_12bit_unsigned_scaled_p (TImode,
10926 offset + 32)));
10927
10928 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10929 instructions (only big endian will get here). */
10930 if (aarch64_advsimd_partial_struct_mode_p (mode)
10931 && known_eq (GET_MODE_SIZE (mode), 32))
10932 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10933 && aarch64_offset_7bit_signed_scaled_p (DImode,
10934 offset + 16));
10935 if (aarch64_advsimd_full_struct_mode_p (mode)
10936 && known_eq (GET_MODE_SIZE (mode), 64))
10937 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10938 && aarch64_offset_7bit_signed_scaled_p (TImode,
10939 offset + 32));
10940
10941 /* Make "m" use the LD1 offset range for SVE data modes, so
10942 that pre-RTL optimizers like ivopts will work to that
10943 instead of the wider LDR/STR range. */
10944 if (vec_flags == VEC_SVE_DATA)
10945 return (type == ADDR_QUERY_M
10946 ? offset_4bit_signed_scaled_p (mode, offset)
10947 : offset_9bit_signed_scaled_p (mode, offset));
10948
10949 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10950 {
10951 poly_int64 end_offset = (offset
10952 + GET_MODE_SIZE (mode)
10953 - BYTES_PER_SVE_VECTOR);
10954 return (type == ADDR_QUERY_M
10955 ? offset_4bit_signed_scaled_p (mode, offset)
10956 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10957 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10958 end_offset)));
10959 }
10960
10961 if (vec_flags == VEC_SVE_PRED)
10962 return offset_9bit_signed_scaled_p (mode, offset);
10963
10964 if (load_store_pair_p)
10965 return ((known_eq (GET_MODE_SIZE (mode), 4)
10966 || known_eq (GET_MODE_SIZE (mode), 8)
10967 || known_eq (GET_MODE_SIZE (mode), 16))
10968 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10969 else
10970 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10971 || offset_12bit_unsigned_scaled_p (mode, offset));
10972 }
10973
10974 if (allow_reg_index_p)
10975 {
10976 /* Look for base + (scaled/extended) index register. */
10977 if (aarch64_base_register_rtx_p (op0, strict_p)
10978 && aarch64_classify_index (info, op1, mode, strict_p))
10979 {
10980 info->base = op0;
10981 return true;
10982 }
10983 if (aarch64_base_register_rtx_p (op1, strict_p)
10984 && aarch64_classify_index (info, op0, mode, strict_p))
10985 {
10986 info->base = op1;
10987 return true;
10988 }
10989 }
10990
10991 return false;
10992
10993 case POST_INC:
10994 case POST_DEC:
10995 case PRE_INC:
10996 case PRE_DEC:
10997 info->type = ADDRESS_REG_WB;
10998 info->base = XEXP (x, 0);
10999 info->offset = NULL_RTX;
11000 return aarch64_base_register_rtx_p (info->base, strict_p);
11001
11002 case POST_MODIFY:
11003 case PRE_MODIFY:
11004 info->type = ADDRESS_REG_WB;
11005 info->base = XEXP (x, 0);
11006 if (GET_CODE (XEXP (x, 1)) == PLUS
11007 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
11008 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
11009 && aarch64_base_register_rtx_p (info->base, strict_p))
11010 {
11011 info->offset = XEXP (XEXP (x, 1), 1);
11012 info->const_offset = offset;
11013
11014 /* TImode, TFmode and TDmode values are allowed in both pairs of X
11015 registers and individual Q registers. The available
11016 address modes are:
11017 X,X: 7-bit signed scaled offset
11018 Q: 9-bit signed offset
11019 We conservatively require an offset representable in either mode.
11020 */
11021 if (mode == TImode || mode == TFmode || mode == TDmode)
11022 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
11023 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
11024
11025 if (load_store_pair_p)
11026 return ((known_eq (GET_MODE_SIZE (mode), 4)
11027 || known_eq (GET_MODE_SIZE (mode), 8)
11028 || known_eq (GET_MODE_SIZE (mode), 16))
11029 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
11030 else
11031 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
11032 }
11033 return false;
11034
11035 case CONST:
11036 case SYMBOL_REF:
11037 case LABEL_REF:
11038 /* load literal: pc-relative constant pool entry. Only supported
11039 for SI mode or larger. */
11040 info->type = ADDRESS_SYMBOLIC;
11041
11042 if (!load_store_pair_p
11043 && GET_MODE_SIZE (mode).is_constant (&const_size)
11044 && const_size >= 4)
11045 {
11046 poly_int64 offset;
11047 rtx sym = strip_offset_and_salt (x, &offset);
11048 return ((LABEL_REF_P (sym)
11049 || (SYMBOL_REF_P (sym)
11050 && CONSTANT_POOL_ADDRESS_P (sym)
11051 && aarch64_pcrelative_literal_loads)));
11052 }
11053 return false;
11054
11055 case LO_SUM:
11056 info->type = ADDRESS_LO_SUM;
11057 info->base = XEXP (x, 0);
11058 info->offset = XEXP (x, 1);
11059 if (allow_reg_index_p
11060 && aarch64_base_register_rtx_p (info->base, strict_p))
11061 {
11062 poly_int64 offset;
11063 HOST_WIDE_INT const_offset;
11064 rtx sym = strip_offset_and_salt (info->offset, &offset);
11065 if (SYMBOL_REF_P (sym)
11066 && offset.is_constant (&const_offset)
11067 && (aarch64_classify_symbol (sym, const_offset)
11068 == SYMBOL_SMALL_ABSOLUTE))
11069 {
11070 /* The symbol and offset must be aligned to the access size. */
11071 unsigned int align;
11072
11073 if (CONSTANT_POOL_ADDRESS_P (sym))
11074 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11075 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11076 {
11077 tree exp = SYMBOL_REF_DECL (sym);
11078 align = TYPE_ALIGN (TREE_TYPE (exp));
11079 align = aarch64_constant_alignment (exp, align);
11080 }
11081 else if (SYMBOL_REF_DECL (sym))
11082 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
11083 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11084 && SYMBOL_REF_BLOCK (sym) != NULL)
11085 align = SYMBOL_REF_BLOCK (sym)->alignment;
11086 else
11087 align = BITS_PER_UNIT;
11088
11089 poly_int64 ref_size = GET_MODE_SIZE (mode);
11090 if (known_eq (ref_size, 0))
11091 ref_size = GET_MODE_SIZE (DImode);
11092
11093 return (multiple_p (const_offset, ref_size)
11094 && multiple_p (align / BITS_PER_UNIT, ref_size));
11095 }
11096 }
11097 return false;
11098
11099 default:
11100 return false;
11101 }
11102 }
11103
11104 /* Return true if the address X is valid for a PRFM instruction.
11105 STRICT_P is true if we should do strict checking with
11106 aarch64_classify_address. */
11107
11108 bool
11109 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11110 {
11111 struct aarch64_address_info addr;
11112
11113 /* PRFM accepts the same addresses as DImode... */
11114 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11115 if (!res)
11116 return false;
11117
11118 /* ... except writeback forms. */
11119 return addr.type != ADDRESS_REG_WB;
11120 }
11121
11122 bool
11123 aarch64_symbolic_address_p (rtx x)
11124 {
11125 poly_int64 offset;
11126 x = strip_offset_and_salt (x, &offset);
11127 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11128 }
11129
11130 /* Classify the base of symbolic expression X. */
11131
11132 enum aarch64_symbol_type
11133 aarch64_classify_symbolic_expression (rtx x)
11134 {
11135 rtx offset;
11136
11137 split_const (x, &x, &offset);
11138 return aarch64_classify_symbol (x, INTVAL (offset));
11139 }
11140
11141
11142 /* Return TRUE if X is a legitimate address for accessing memory in
11143 mode MODE. */
11144 static bool
11145 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
11146 {
11147 struct aarch64_address_info addr;
11148
11149 return aarch64_classify_address (&addr, x, mode, strict_p);
11150 }
11151
11152 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11153 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11154 bool
11155 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11156 aarch64_addr_query_type type)
11157 {
11158 struct aarch64_address_info addr;
11159
11160 return aarch64_classify_address (&addr, x, mode, strict_p, type);
11161 }
11162
11163 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11164
11165 static bool
11166 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11167 poly_int64 orig_offset,
11168 machine_mode mode)
11169 {
11170 HOST_WIDE_INT size;
11171 if (GET_MODE_SIZE (mode).is_constant (&size))
11172 {
11173 HOST_WIDE_INT const_offset, second_offset;
11174
11175 /* A general SVE offset is A * VQ + B. Remove the A component from
11176 coefficient 0 in order to get the constant B. */
11177 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11178
11179 /* Split an out-of-range address displacement into a base and
11180 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
11181 range otherwise to increase opportunities for sharing the base
11182 address of different sizes. Unaligned accesses use the signed
11183 9-bit range; TImode/TFmode/TDmode use the intersection of the signed
11184 scaled 7-bit and signed 9-bit offset ranges. */
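/* For example (illustrative numbers): an aligned DImode access at offset
   0x10010 is split into an anchor offset of 0x10000 plus an in-range
   offset of 0x10, since 0x10010 & 0x3ffc == 0x10. */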
11185 if (mode == TImode || mode == TFmode || mode == TDmode)
11186 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11187 else if ((const_offset & (size - 1)) != 0)
11188 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11189 else
11190 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11191
11192 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11193 return false;
11194
11195 /* Split the offset into second_offset and the rest. */
11196 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11197 *offset2 = gen_int_mode (second_offset, Pmode);
11198 return true;
11199 }
11200 else
11201 {
11202 /* Get the mode we should use as the basis of the range. For structure
11203 modes this is the mode of one vector. */
11204 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11205 machine_mode step_mode
11206 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11207
11208 /* Get the "mul vl" multiplier we'd like to use. */
11209 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11210 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11211 if (vec_flags & VEC_SVE_DATA)
11212 /* LDR supports a 9-bit range, but the move patterns for
11213 structure modes require all vectors to be in range of the
11214 same base. The simplest way of accommodating that while still
11215 promoting reuse of anchor points between different modes is
11216 to use an 8-bit range unconditionally. */
11217 vnum = ((vnum + 128) & 255) - 128;
11218 else
11219 /* Predicates are only handled singly, so we might as well use
11220 the full range. */
11221 vnum = ((vnum + 256) & 511) - 256;
11222 if (vnum == 0)
11223 return false;
11224
11225 /* Convert the "mul vl" multiplier into a byte offset. */
11226 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11227 if (known_eq (second_offset, orig_offset))
11228 return false;
11229
11230 /* Split the offset into second_offset and the rest. */
11231 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11232 *offset2 = gen_int_mode (second_offset, Pmode);
11233 return true;
11234 }
11235 }
11236
11237 /* Return the binary representation of floating point constant VALUE in INTVAL.
11238 If the value cannot be converted, return false without setting INTVAL.
11239 The conversion is done in the given MODE. */
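/* For instance, the DFmode constant 1.0 comes back as the IEEE-754 bit
   pattern 0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000,
   zero-extended into the HOST_WIDE_INT. */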
11240 bool
11241 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11242 {
11243
11244 /* We make a general exception for 0. */
11245 if (aarch64_float_const_zero_rtx_p (value))
11246 {
11247 *intval = 0;
11248 return true;
11249 }
11250
11251 scalar_float_mode mode;
11252 if (!CONST_DOUBLE_P (value)
11253 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11254 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11255 /* Only support up to DF mode. */
11256 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11257 return false;
11258
11259 unsigned HOST_WIDE_INT ival = 0;
11260
11261 long res[2];
11262 real_to_target (res,
11263 CONST_DOUBLE_REAL_VALUE (value),
11264 REAL_MODE_FORMAT (mode));
11265
11266 if (mode == DFmode || mode == DDmode)
11267 {
11268 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11269 ival = zext_hwi (res[order], 32);
11270 ival |= (zext_hwi (res[1 - order], 32) << 32);
11271 }
11272 else
11273 ival = zext_hwi (res[0], 32);
11274
11275 *intval = ival;
11276 return true;
11277 }
11278
11279 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11280 single MOV(+MOVK) followed by an FMOV. */
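/* For example, 1.0 in DFmode has the bit pattern 0x3ff0000000000000,
   which a single MOVZ (#0x3ff0, LSL #48) can materialize, so an FMOV from
   the integer register is cheaper than an ADRP+LDR literal load.
   (Illustrative; the actual instruction count comes from
   aarch64_internal_mov_immediate below.) */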
11281 bool
11282 aarch64_float_const_rtx_p (rtx x)
11283 {
11284 machine_mode mode = GET_MODE (x);
11285 if (mode == VOIDmode)
11286 return false;
11287
11288 /* Determine whether it's cheaper to write float constants as
11289 mov/movk pairs over ldr/adrp pairs. */
11290 unsigned HOST_WIDE_INT ival;
11291
11292 if (CONST_DOUBLE_P (x)
11293 && SCALAR_FLOAT_MODE_P (mode)
11294 && aarch64_reinterpret_float_as_int (x, &ival))
11295 {
11296 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11297 int num_instr = aarch64_internal_mov_immediate
11298 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11299 return num_instr < 3;
11300 }
11301
11302 return false;
11303 }
11304
11305 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11306 Floating Point). */
11307 bool
11308 aarch64_float_const_zero_rtx_p (rtx x)
11309 {
11310 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11311 zr as our callers expect, so no need to check the actual
11312 value if X is of Decimal Floating Point type. */
11313 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11314 return false;
11315
11316 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11317 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11318 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11319 }
11320
11321 /* Return TRUE if rtx X is immediate constant that fits in a single
11322 MOVI immediate operation. */
11323 bool
11324 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11325 {
11326 if (!TARGET_SIMD)
11327 return false;
11328
11329 machine_mode vmode;
11330 scalar_int_mode imode;
11331 unsigned HOST_WIDE_INT ival;
11332
11333 if (CONST_DOUBLE_P (x)
11334 && SCALAR_FLOAT_MODE_P (mode))
11335 {
11336 if (!aarch64_reinterpret_float_as_int (x, &ival))
11337 return false;
11338
11339 /* We make a general exception for 0. */
11340 if (aarch64_float_const_zero_rtx_p (x))
11341 return true;
11342
11343 imode = int_mode_for_mode (mode).require ();
11344 }
11345 else if (CONST_INT_P (x)
11346 && is_a <scalar_int_mode> (mode, &imode))
11347 ival = INTVAL (x);
11348 else
11349 return false;
11350
11351 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
11352 use a 128-bit vector mode. */
11353 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11354
11355 vmode = aarch64_simd_container_mode (imode, width);
11356 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11357
11358 return aarch64_simd_valid_immediate (v_op, NULL);
11359 }
11360
11361
11362 /* Return the fixed registers used for condition codes. */
11363
11364 static bool
11365 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11366 {
11367 *p1 = CC_REGNUM;
11368 *p2 = INVALID_REGNUM;
11369 return true;
11370 }
11371
11372 /* This function is used by the call expanders of the machine description.
11373 RESULT is the register in which the result is returned. It's NULL for
11374 "call" and "sibcall".
11375 MEM is the location of the function call.
11376 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11377 SIBCALL indicates whether this is a normal call or a sibling call;
11378 a different pattern is generated accordingly. */
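/* As a sketch of the RTL this function builds (see the code below for the
   exact details):

     (parallel [(call (mem ...) (const_int 0))
                (unspec:DI [(const_int <abi>)] UNSPEC_CALLEE_ABI)
                (clobber (reg LR_REGNUM))])

   with the CALL wrapped in a SET when there is a result, and the clobber
   of LR replaced by a return rtx for sibling calls. */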
11379
11380 void
11381 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11382 {
11383 rtx call, callee, tmp;
11384 rtvec vec;
11385 machine_mode mode;
11386
11387 gcc_assert (MEM_P (mem));
11388 callee = XEXP (mem, 0);
11389 mode = GET_MODE (callee);
11390 gcc_assert (mode == Pmode);
11391
11392 /* Decide if we should generate indirect calls by loading the
11393 address of the callee into a register before performing
11394 the branch-and-link. */
11395 if (SYMBOL_REF_P (callee)
11396 ? (aarch64_is_long_call_p (callee)
11397 || aarch64_is_noplt_call_p (callee))
11398 : !REG_P (callee))
11399 XEXP (mem, 0) = force_reg (mode, callee);
11400
11401 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11402
11403 if (result != NULL_RTX)
11404 call = gen_rtx_SET (result, call);
11405
11406 if (sibcall)
11407 tmp = ret_rtx;
11408 else
11409 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11410
11411 gcc_assert (CONST_INT_P (callee_abi));
11412 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11413 UNSPEC_CALLEE_ABI);
11414
11415 vec = gen_rtvec (3, call, callee_abi, tmp);
11416 call = gen_rtx_PARALLEL (VOIDmode, vec);
11417
11418 aarch64_emit_call_insn (call);
11419 }
11420
11421 /* Emit call insn with PAT and do aarch64-specific handling. */
11422
11423 void
11424 aarch64_emit_call_insn (rtx pat)
11425 {
11426 rtx insn = emit_call_insn (pat);
11427
11428 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11429 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11430 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11431 }
11432
11433 machine_mode
11434 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11435 {
11436 machine_mode mode_x = GET_MODE (x);
11437 rtx_code code_x = GET_CODE (x);
11438
11439 /* All floating point compares return CCFP if it is an equality
11440 comparison, and CCFPE otherwise. */
11441 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11442 {
11443 switch (code)
11444 {
11445 case EQ:
11446 case NE:
11447 case UNORDERED:
11448 case ORDERED:
11449 case UNLT:
11450 case UNLE:
11451 case UNGT:
11452 case UNGE:
11453 case UNEQ:
11454 return CCFPmode;
11455
11456 case LT:
11457 case LE:
11458 case GT:
11459 case GE:
11460 case LTGT:
11461 return CCFPEmode;
11462
11463 default:
11464 gcc_unreachable ();
11465 }
11466 }
11467
11468 /* Equality comparisons of short modes against zero can be performed
11469 using the TST instruction with the appropriate bitmask. */
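/* For example, an equality test of a QImode register against zero can be
   implemented as "tst wN, #0xff" (illustrative). */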
11470 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11471 && (code == EQ || code == NE)
11472 && (mode_x == HImode || mode_x == QImode))
11473 return CC_Zmode;
11474
11475 /* Similarly, comparisons of zero_extends from shorter modes can
11476 be performed using an ANDS with an immediate mask. */
11477 if (y == const0_rtx && code_x == ZERO_EXTEND
11478 && (mode_x == SImode || mode_x == DImode)
11479 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11480 && (code == EQ || code == NE))
11481 return CC_Zmode;
11482
11483 /* Zero extracts support equality comparisons. */
11484 if ((mode_x == SImode || mode_x == DImode)
11485 && y == const0_rtx
11486 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11487 && CONST_INT_P (XEXP (x, 2)))
11488 && (code == EQ || code == NE))
11489 return CC_Zmode;
11490
11491 /* ANDS/BICS/TST support equality and all signed comparisons. */
11492 if ((mode_x == SImode || mode_x == DImode)
11493 && y == const0_rtx
11494 && (code_x == AND)
11495 && (code == EQ || code == NE || code == LT || code == GE
11496 || code == GT || code == LE))
11497 return CC_NZVmode;
11498
11499 /* ADDS/SUBS correctly set N and Z flags. */
11500 if ((mode_x == SImode || mode_x == DImode)
11501 && y == const0_rtx
11502 && (code == EQ || code == NE || code == LT || code == GE)
11503 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11504 return CC_NZmode;
11505
11506 /* A compare with a shifted operand. Because of canonicalization,
11507 the comparison will have to be swapped when we emit the assembly
11508 code. */
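/* For instance, (compare (ashift x 2) y) is output as "cmp y, x, lsl #2",
   so the condition has to be tested in swapped form (illustrative). */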
11509 if ((mode_x == SImode || mode_x == DImode)
11510 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11511 && (code_x == ASHIFT || code_x == ASHIFTRT
11512 || code_x == LSHIFTRT
11513 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11514 return CC_SWPmode;
11515
11516 /* Similarly for a negated operand, but we can only do this for
11517 equalities. */
11518 if ((mode_x == SImode || mode_x == DImode)
11519 && (REG_P (y) || SUBREG_P (y))
11520 && (code == EQ || code == NE)
11521 && code_x == NEG)
11522 return CC_Zmode;
11523
11524 /* A test for unsigned overflow from an addition. */
11525 if ((mode_x == DImode || mode_x == TImode)
11526 && (code == LTU || code == GEU)
11527 && code_x == PLUS
11528 && rtx_equal_p (XEXP (x, 0), y))
11529 return CC_Cmode;
11530
11531 /* A test for unsigned overflow from an add with carry. */
11532 if ((mode_x == DImode || mode_x == TImode)
11533 && (code == LTU || code == GEU)
11534 && code_x == PLUS
11535 && CONST_SCALAR_INT_P (y)
11536 && (rtx_mode_t (y, mode_x)
11537 == (wi::shwi (1, mode_x)
11538 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11539 return CC_ADCmode;
11540
11541 /* A test for signed overflow. */
11542 if ((mode_x == DImode || mode_x == TImode)
11543 && code == NE
11544 && code_x == PLUS
11545 && GET_CODE (y) == SIGN_EXTEND)
11546 return CC_Vmode;
11547
11548 /* For everything else, return CCmode. */
11549 return CCmode;
11550 }
11551
11552 static int
11553 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11554
11555 int
11556 aarch64_get_condition_code (rtx x)
11557 {
11558 machine_mode mode = GET_MODE (XEXP (x, 0));
11559 enum rtx_code comp_code = GET_CODE (x);
11560
11561 if (GET_MODE_CLASS (mode) != MODE_CC)
11562 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11563 return aarch64_get_condition_code_1 (mode, comp_code);
11564 }
11565
11566 static int
11567 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11568 {
11569 switch (mode)
11570 {
11571 case E_CCFPmode:
11572 case E_CCFPEmode:
11573 switch (comp_code)
11574 {
11575 case GE: return AARCH64_GE;
11576 case GT: return AARCH64_GT;
11577 case LE: return AARCH64_LS;
11578 case LT: return AARCH64_MI;
11579 case NE: return AARCH64_NE;
11580 case EQ: return AARCH64_EQ;
11581 case ORDERED: return AARCH64_VC;
11582 case UNORDERED: return AARCH64_VS;
11583 case UNLT: return AARCH64_LT;
11584 case UNLE: return AARCH64_LE;
11585 case UNGT: return AARCH64_HI;
11586 case UNGE: return AARCH64_PL;
11587 default: return -1;
11588 }
11589 break;
11590
11591 case E_CCmode:
11592 switch (comp_code)
11593 {
11594 case NE: return AARCH64_NE;
11595 case EQ: return AARCH64_EQ;
11596 case GE: return AARCH64_GE;
11597 case GT: return AARCH64_GT;
11598 case LE: return AARCH64_LE;
11599 case LT: return AARCH64_LT;
11600 case GEU: return AARCH64_CS;
11601 case GTU: return AARCH64_HI;
11602 case LEU: return AARCH64_LS;
11603 case LTU: return AARCH64_CC;
11604 default: return -1;
11605 }
11606 break;
11607
11608 case E_CC_SWPmode:
11609 switch (comp_code)
11610 {
11611 case NE: return AARCH64_NE;
11612 case EQ: return AARCH64_EQ;
11613 case GE: return AARCH64_LE;
11614 case GT: return AARCH64_LT;
11615 case LE: return AARCH64_GE;
11616 case LT: return AARCH64_GT;
11617 case GEU: return AARCH64_LS;
11618 case GTU: return AARCH64_CC;
11619 case LEU: return AARCH64_CS;
11620 case LTU: return AARCH64_HI;
11621 default: return -1;
11622 }
11623 break;
11624
11625 case E_CC_NZCmode:
11626 switch (comp_code)
11627 {
11628 case NE: return AARCH64_NE; /* = any */
11629 case EQ: return AARCH64_EQ; /* = none */
11630 case GE: return AARCH64_PL; /* = nfrst */
11631 case LT: return AARCH64_MI; /* = first */
11632 case GEU: return AARCH64_CS; /* = nlast */
11633 case GTU: return AARCH64_HI; /* = pmore */
11634 case LEU: return AARCH64_LS; /* = plast */
11635 case LTU: return AARCH64_CC; /* = last */
11636 default: return -1;
11637 }
11638 break;
11639
11640 case E_CC_NZVmode:
11641 switch (comp_code)
11642 {
11643 case NE: return AARCH64_NE;
11644 case EQ: return AARCH64_EQ;
11645 case GE: return AARCH64_PL;
11646 case LT: return AARCH64_MI;
11647 case GT: return AARCH64_GT;
11648 case LE: return AARCH64_LE;
11649 default: return -1;
11650 }
11651 break;
11652
11653 case E_CC_NZmode:
11654 switch (comp_code)
11655 {
11656 case NE: return AARCH64_NE;
11657 case EQ: return AARCH64_EQ;
11658 case GE: return AARCH64_PL;
11659 case LT: return AARCH64_MI;
11660 default: return -1;
11661 }
11662 break;
11663
11664 case E_CC_Zmode:
11665 switch (comp_code)
11666 {
11667 case NE: return AARCH64_NE;
11668 case EQ: return AARCH64_EQ;
11669 default: return -1;
11670 }
11671 break;
11672
11673 case E_CC_Cmode:
11674 switch (comp_code)
11675 {
11676 case LTU: return AARCH64_CS;
11677 case GEU: return AARCH64_CC;
11678 default: return -1;
11679 }
11680 break;
11681
11682 case E_CC_ADCmode:
11683 switch (comp_code)
11684 {
11685 case GEU: return AARCH64_CS;
11686 case LTU: return AARCH64_CC;
11687 default: return -1;
11688 }
11689 break;
11690
11691 case E_CC_Vmode:
11692 switch (comp_code)
11693 {
11694 case NE: return AARCH64_VS;
11695 case EQ: return AARCH64_VC;
11696 default: return -1;
11697 }
11698 break;
11699
11700 default:
11701 return -1;
11702 }
11703
11704 return -1;
11705 }
11706
11707 bool
11708 aarch64_const_vec_all_same_in_range_p (rtx x,
11709 HOST_WIDE_INT minval,
11710 HOST_WIDE_INT maxval)
11711 {
11712 rtx elt;
11713 return (const_vec_duplicate_p (x, &elt)
11714 && CONST_INT_P (elt)
11715 && IN_RANGE (INTVAL (elt), minval, maxval));
11716 }
11717
11718 bool
11719 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11720 {
11721 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11722 }
11723
11724 /* Return true if VEC is a constant in which every element is in the range
11725 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11726
11727 static bool
11728 aarch64_const_vec_all_in_range_p (rtx vec,
11729 HOST_WIDE_INT minval,
11730 HOST_WIDE_INT maxval)
11731 {
11732 if (!CONST_VECTOR_P (vec)
11733 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11734 return false;
11735
11736 int nunits;
11737 if (!CONST_VECTOR_STEPPED_P (vec))
11738 nunits = const_vector_encoded_nelts (vec);
11739 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11740 return false;
11741
11742 for (int i = 0; i < nunits; i++)
11743 {
11744 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11745 if (!CONST_INT_P (vec_elem)
11746 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11747 return false;
11748 }
11749 return true;
11750 }
11751
11752 /* N Z C V. */
11753 #define AARCH64_CC_V 1
11754 #define AARCH64_CC_C (1 << 1)
11755 #define AARCH64_CC_Z (1 << 2)
11756 #define AARCH64_CC_N (1 << 3)
11757
11758 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11759 static const int aarch64_nzcv_codes[] =
11760 {
11761 0, /* EQ, Z == 1. */
11762 AARCH64_CC_Z, /* NE, Z == 0. */
11763 0, /* CS, C == 1. */
11764 AARCH64_CC_C, /* CC, C == 0. */
11765 0, /* MI, N == 1. */
11766 AARCH64_CC_N, /* PL, N == 0. */
11767 0, /* VS, V == 1. */
11768 AARCH64_CC_V, /* VC, V == 0. */
11769 0, /* HI, C == 1 && Z == 0. */
11770 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11771 AARCH64_CC_V, /* GE, N == V. */
11772 0, /* LT, N != V. */
11773 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11774 0, /* LE, !(Z == 0 && N == V). */
11775 0, /* AL, Any. */
11776 0 /* NV, Any. */
11777 };
11778
11779 /* Print floating-point vector immediate operand X to F, negating it
11780 first if NEGATE is true. Return true on success, false if it isn't
11781 a constant we can handle. */
11782
11783 static bool
11784 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11785 {
11786 rtx elt;
11787
11788 if (!const_vec_duplicate_p (x, &elt))
11789 return false;
11790
11791 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11792 if (negate)
11793 r = real_value_negate (&r);
11794
11795 /* Handle the SVE single-bit immediates specially, since they have a
11796 fixed form in the assembly syntax. */
11797 if (real_equal (&r, &dconst0))
11798 asm_fprintf (f, "0.0");
11799 else if (real_equal (&r, &dconst2))
11800 asm_fprintf (f, "2.0");
11801 else if (real_equal (&r, &dconst1))
11802 asm_fprintf (f, "1.0");
11803 else if (real_equal (&r, &dconsthalf))
11804 asm_fprintf (f, "0.5");
11805 else
11806 {
11807 const int buf_size = 20;
11808 char float_buf[buf_size] = {'\0'};
11809 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11810 1, GET_MODE (elt));
11811 asm_fprintf (f, "%s", float_buf);
11812 }
11813
11814 return true;
11815 }
11816
11817 /* Return the assembler element-size suffix letter for SIZE (in bits). */
11818 static char
11819 sizetochar (int size)
11820 {
11821 switch (size)
11822 {
11823 case 64: return 'd';
11824 case 32: return 's';
11825 case 16: return 'h';
11826 case 8 : return 'b';
11827 default: gcc_unreachable ();
11828 }
11829 }
11830
11831 /* Print operand X to file F in a target specific manner according to CODE.
11832 The acceptable formatting commands given by CODE are:
11833 'c': An integer or symbol address without a preceding #
11834 sign.
11835 'C': Take the duplicated element in a vector constant
11836 and print it in hex.
11837 'D': Take the duplicated element in a vector constant
11838 and print it as an unsigned integer, in decimal.
11839 'e': Print the sign/zero-extend size as a character 8->b,
11840 16->h, 32->w. Can also be used for masks:
11841 0xff->b, 0xffff->h, 0xffffffff->w.
11842 'I': If the operand is a duplicated vector constant,
11843 replace it with the duplicated scalar. If the
11844 operand is then a floating-point constant, replace
11845 it with the integer bit representation. Print the
11846 transformed constant as a signed decimal number.
11847 'p': Prints N such that 2^N == X (X must be a power of 2 and
11848 a const_int).
11849 'P': Print the number of non-zero bits in X (a const_int).
11850 'H': Print the higher numbered register of a pair (TImode)
11851 of regs.
11852 'm': Print a condition (eq, ne, etc).
11853 'M': Same as 'm', but invert condition.
11854 'N': Take the duplicated element in a vector constant
11855 and print the negative of it in decimal.
11856 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11857 'S/T/U/V': Print a FP/SIMD register name for a register list.
11858 The register printed is the FP/SIMD register name
11859 of X + 0/1/2/3 for S/T/U/V.
11860 'R': Print a scalar Integer/FP/SIMD register name + 1.
11861 'X': Print bottom 16 bits of integer constant in hex.
11862 'w/x': Print a general register name or the zero register
11863 (32-bit or 64-bit).
11864 '0': Print a normal operand; if it's a general register,
11865 then we assume DImode.
11866 'k': Print NZCV for conditional compare instructions.
11867 'A': Output address constant representing the first
11868 argument of X, specifying a relocation offset
11869 if appropriate.
11870 'L': Output constant address specified by X
11871 with a relocation offset if appropriate.
11872 'G': Prints address of X, specifying a PC relative
11873 relocation mode if appropriate.
11874 'y': Output address of LDP or STP - this is used for
11875 some LDP/STPs which don't use a PARALLEL in their
11876 pattern (so the mode needs to be adjusted).
11877 'z': Output address of a typical LDP or STP. */
11878
11879 static void
11880 aarch64_print_operand (FILE *f, rtx x, int code)
11881 {
11882 rtx elt;
11883 switch (code)
11884 {
11885 case 'c':
11886 if (CONST_INT_P (x))
11887 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11888 else
11889 {
11890 poly_int64 offset;
11891 rtx base = strip_offset_and_salt (x, &offset);
11892 if (SYMBOL_REF_P (base))
11893 output_addr_const (f, x);
11894 else
11895 output_operand_lossage ("unsupported operand for code '%c'", code);
11896 }
11897 break;
11898
11899 case 'e':
11900 {
11901 x = unwrap_const_vec_duplicate (x);
11902 if (!CONST_INT_P (x))
11903 {
11904 output_operand_lossage ("invalid operand for '%%%c'", code);
11905 return;
11906 }
11907
11908 HOST_WIDE_INT val = INTVAL (x);
11909 if ((val & ~7) == 8 || val == 0xff)
11910 fputc ('b', f);
11911 else if ((val & ~7) == 16 || val == 0xffff)
11912 fputc ('h', f);
11913 else if ((val & ~7) == 32 || val == 0xffffffff)
11914 fputc ('w', f);
11915 else
11916 {
11917 output_operand_lossage ("invalid operand for '%%%c'", code);
11918 return;
11919 }
11920 }
11921 break;
11922
11923 case 'p':
11924 {
11925 int n;
11926
11927 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11928 {
11929 output_operand_lossage ("invalid operand for '%%%c'", code);
11930 return;
11931 }
11932
11933 asm_fprintf (f, "%d", n);
11934 }
11935 break;
11936
11937 case 'P':
11938 if (!CONST_INT_P (x))
11939 {
11940 output_operand_lossage ("invalid operand for '%%%c'", code);
11941 return;
11942 }
11943
11944 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11945 break;
11946
11947 case 'H':
11948 if (x == const0_rtx)
11949 {
11950 asm_fprintf (f, "xzr");
11951 break;
11952 }
11953
11954 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11955 {
11956 output_operand_lossage ("invalid operand for '%%%c'", code);
11957 return;
11958 }
11959
11960 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11961 break;
11962
11963 case 'I':
11964 {
11965 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11966 if (CONST_INT_P (x))
11967 asm_fprintf (f, "%wd", INTVAL (x));
11968 else
11969 {
11970 output_operand_lossage ("invalid operand for '%%%c'", code);
11971 return;
11972 }
11973 break;
11974 }
11975
11976 case 'M':
11977 case 'm':
11978 {
11979 int cond_code;
11980 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11981 if (x == const_true_rtx)
11982 {
11983 if (code == 'M')
11984 fputs ("nv", f);
11985 return;
11986 }
11987
11988 if (!COMPARISON_P (x))
11989 {
11990 output_operand_lossage ("invalid operand for '%%%c'", code);
11991 return;
11992 }
11993
11994 cond_code = aarch64_get_condition_code (x);
11995 gcc_assert (cond_code >= 0);
11996 if (code == 'M')
11997 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11998 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11999 fputs (aarch64_sve_condition_codes[cond_code], f);
12000 else
12001 fputs (aarch64_condition_codes[cond_code], f);
12002 }
12003 break;
12004
12005 case 'N':
12006 if (!const_vec_duplicate_p (x, &elt))
12007 {
12008 output_operand_lossage ("invalid vector constant");
12009 return;
12010 }
12011
12012 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12013 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12014 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12015 && aarch64_print_vector_float_operand (f, x, true))
12016 ;
12017 else
12018 {
12019 output_operand_lossage ("invalid vector constant");
12020 return;
12021 }
12022 break;
12023
12024 case 'b':
12025 case 'h':
12026 case 's':
12027 case 'd':
12028 case 'q':
12029 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12030 {
12031 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12032 return;
12033 }
12034 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12035 break;
12036
12037 case 'S':
12038 case 'T':
12039 case 'U':
12040 case 'V':
12041 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12042 {
12043 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12044 return;
12045 }
12046 asm_fprintf (f, "%c%d",
12047 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12048 REGNO (x) - V0_REGNUM + (code - 'S'));
12049 break;
12050
12051 case 'R':
12052 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12053 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12054 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12055 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12056 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12057 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12058 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12059 else
12060 output_operand_lossage ("incompatible register operand for '%%%c'",
12061 code);
12062 break;
12063
12064 case 'X':
12065 if (!CONST_INT_P (x))
12066 {
12067 output_operand_lossage ("invalid operand for '%%%c'", code);
12068 return;
12069 }
12070 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12071 break;
12072
12073 case 'C':
12074 {
12075 /* Print a replicated constant in hex. */
12076 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12077 {
12078 output_operand_lossage ("invalid operand for '%%%c'", code);
12079 return;
12080 }
12081 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12082 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12083 }
12084 break;
12085
12086 case 'D':
12087 {
12088 /* Print a replicated constant in decimal, treating it as
12089 unsigned. */
12090 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12091 {
12092 output_operand_lossage ("invalid operand for '%%%c'", code);
12093 return;
12094 }
12095 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12096 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12097 }
12098 break;
12099
12100 case 'w':
12101 case 'x':
12102 if (x == const0_rtx
12103 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
12104 {
12105 asm_fprintf (f, "%czr", code);
12106 break;
12107 }
12108
12109 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12110 {
12111 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12112 break;
12113 }
12114
12115 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12116 {
12117 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12118 break;
12119 }
12120
12121 /* Fall through */
12122
12123 case 0:
12124 if (x == NULL)
12125 {
12126 output_operand_lossage ("missing operand");
12127 return;
12128 }
12129
12130 switch (GET_CODE (x))
12131 {
12132 case REG:
12133 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12134 {
12135 if (REG_NREGS (x) == 1)
12136 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12137 else
12138 {
12139 char suffix
12140 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12141 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12142 REGNO (x) - V0_REGNUM, suffix,
12143 END_REGNO (x) - V0_REGNUM - 1, suffix);
12144 }
12145 }
12146 else
12147 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12148 break;
12149
12150 case MEM:
12151 output_address (GET_MODE (x), XEXP (x, 0));
12152 break;
12153
12154 case LABEL_REF:
12155 case SYMBOL_REF:
12156 output_addr_const (asm_out_file, x);
12157 break;
12158
12159 case CONST_INT:
12160 asm_fprintf (f, "%wd", INTVAL (x));
12161 break;
12162
12163 case CONST:
12164 if (!VECTOR_MODE_P (GET_MODE (x)))
12165 {
12166 output_addr_const (asm_out_file, x);
12167 break;
12168 }
12169 /* fall through */
12170
12171 case CONST_VECTOR:
12172 if (!const_vec_duplicate_p (x, &elt))
12173 {
12174 output_operand_lossage ("invalid vector constant");
12175 return;
12176 }
12177
12178 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12179 asm_fprintf (f, "%wd", INTVAL (elt));
12180 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12181 && aarch64_print_vector_float_operand (f, x, false))
12182 ;
12183 else
12184 {
12185 output_operand_lossage ("invalid vector constant");
12186 return;
12187 }
12188 break;
12189
12190 case CONST_DOUBLE:
12191 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12192 be getting CONST_DOUBLEs holding integers. */
12193 gcc_assert (GET_MODE (x) != VOIDmode);
12194 if (aarch64_float_const_zero_rtx_p (x))
12195 {
12196 fputc ('0', f);
12197 break;
12198 }
12199 else if (aarch64_float_const_representable_p (x))
12200 {
12201 #define buf_size 20
12202 char float_buf[buf_size] = {'\0'};
12203 real_to_decimal_for_mode (float_buf,
12204 CONST_DOUBLE_REAL_VALUE (x),
12205 buf_size, buf_size,
12206 1, GET_MODE (x));
12207 asm_fprintf (asm_out_file, "%s", float_buf);
12208 break;
12209 #undef buf_size
12210 }
12211 output_operand_lossage ("invalid constant");
12212 return;
12213 default:
12214 output_operand_lossage ("invalid operand");
12215 return;
12216 }
12217 break;
12218
12219 case 'A':
12220 if (GET_CODE (x) == HIGH)
12221 x = XEXP (x, 0);
12222
12223 switch (aarch64_classify_symbolic_expression (x))
12224 {
12225 case SYMBOL_SMALL_GOT_4G:
12226 asm_fprintf (asm_out_file, ":got:");
12227 break;
12228
12229 case SYMBOL_SMALL_TLSGD:
12230 asm_fprintf (asm_out_file, ":tlsgd:");
12231 break;
12232
12233 case SYMBOL_SMALL_TLSDESC:
12234 asm_fprintf (asm_out_file, ":tlsdesc:");
12235 break;
12236
12237 case SYMBOL_SMALL_TLSIE:
12238 asm_fprintf (asm_out_file, ":gottprel:");
12239 break;
12240
12241 case SYMBOL_TLSLE24:
12242 asm_fprintf (asm_out_file, ":tprel:");
12243 break;
12244
12245 case SYMBOL_TINY_GOT:
12246 gcc_unreachable ();
12247 break;
12248
12249 default:
12250 break;
12251 }
12252 output_addr_const (asm_out_file, x);
12253 break;
12254
12255 case 'L':
12256 switch (aarch64_classify_symbolic_expression (x))
12257 {
12258 case SYMBOL_SMALL_GOT_4G:
12259 asm_fprintf (asm_out_file, ":got_lo12:");
12260 break;
12261
12262 case SYMBOL_SMALL_TLSGD:
12263 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12264 break;
12265
12266 case SYMBOL_SMALL_TLSDESC:
12267 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12268 break;
12269
12270 case SYMBOL_SMALL_TLSIE:
12271 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12272 break;
12273
12274 case SYMBOL_TLSLE12:
12275 asm_fprintf (asm_out_file, ":tprel_lo12:");
12276 break;
12277
12278 case SYMBOL_TLSLE24:
12279 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12280 break;
12281
12282 case SYMBOL_TINY_GOT:
12283 asm_fprintf (asm_out_file, ":got:");
12284 break;
12285
12286 case SYMBOL_TINY_TLSIE:
12287 asm_fprintf (asm_out_file, ":gottprel:");
12288 break;
12289
12290 default:
12291 break;
12292 }
12293 output_addr_const (asm_out_file, x);
12294 break;
12295
12296 case 'G':
12297 switch (aarch64_classify_symbolic_expression (x))
12298 {
12299 case SYMBOL_TLSLE24:
12300 asm_fprintf (asm_out_file, ":tprel_hi12:");
12301 break;
12302 default:
12303 break;
12304 }
12305 output_addr_const (asm_out_file, x);
12306 break;
12307
12308 case 'k':
12309 {
12310 HOST_WIDE_INT cond_code;
12311
12312 if (!CONST_INT_P (x))
12313 {
12314 output_operand_lossage ("invalid operand for '%%%c'", code);
12315 return;
12316 }
12317
12318 cond_code = INTVAL (x);
12319 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12320 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12321 }
12322 break;
12323
12324 case 'y':
12325 case 'z':
12326 {
12327 machine_mode mode = GET_MODE (x);
12328
12329 if (!MEM_P (x)
12330 || (code == 'y'
12331 && maybe_ne (GET_MODE_SIZE (mode), 8)
12332 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12333 {
12334 output_operand_lossage ("invalid operand for '%%%c'", code);
12335 return;
12336 }
12337
12338 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12339 code == 'y'
12340 ? ADDR_QUERY_LDP_STP_N
12341 : ADDR_QUERY_LDP_STP))
12342 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12343 }
12344 break;
12345
12346 default:
12347 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12348 return;
12349 }
12350 }
12351
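/* A few illustrative outputs of the operand codes handled above, assuming
   the usual register naming (x0-x30, w0-w30, v0-v31):
     %w with (reg:SI x5)            -> "w5"
     %x with const0_rtx             -> "xzr"
     %e with (const_int 0xffff)     -> "h"
     %p with (const_int 64)         -> "6"
     %P with (const_int 0xf0)       -> "4"
     %X with (const_int 0x12345678) -> "0x5678"  */
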
12352 /* Print address 'x' of a memory access with mode 'mode'.
12353 'type' is the context required by aarch64_classify_address; it
12354 distinguishes normal memory accesses from LDP/STP address queries. */
12355 static bool
12356 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12357 aarch64_addr_query_type type)
12358 {
12359 struct aarch64_address_info addr;
12360 unsigned int size, vec_flags;
12361
12362 /* Check all addresses are Pmode - including ILP32. */
12363 if (GET_MODE (x) != Pmode
12364 && (!CONST_INT_P (x)
12365 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12366 {
12367 output_operand_lossage ("invalid address mode");
12368 return false;
12369 }
12370
12371 if (aarch64_classify_address (&addr, x, mode, true, type))
12372 switch (addr.type)
12373 {
12374 case ADDRESS_REG_IMM:
12375 if (known_eq (addr.const_offset, 0))
12376 {
12377 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12378 return true;
12379 }
12380
12381 vec_flags = aarch64_classify_vector_mode (mode);
12382 if (vec_flags & VEC_ANY_SVE)
12383 {
12384 HOST_WIDE_INT vnum
12385 = exact_div (addr.const_offset,
12386 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12387 asm_fprintf (f, "[%s, #%wd, mul vl]",
12388 reg_names[REGNO (addr.base)], vnum);
12389 return true;
12390 }
12391
12392 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12393 INTVAL (addr.offset));
12394 return true;
12395
12396 case ADDRESS_REG_REG:
12397 if (addr.shift == 0)
12398 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12399 reg_names [REGNO (addr.offset)]);
12400 else
12401 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12402 reg_names [REGNO (addr.offset)], addr.shift);
12403 return true;
12404
12405 case ADDRESS_REG_UXTW:
12406 if (addr.shift == 0)
12407 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12408 REGNO (addr.offset) - R0_REGNUM);
12409 else
12410 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12411 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12412 return true;
12413
12414 case ADDRESS_REG_SXTW:
12415 if (addr.shift == 0)
12416 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12417 REGNO (addr.offset) - R0_REGNUM);
12418 else
12419 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12420 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12421 return true;
12422
12423 case ADDRESS_REG_WB:
12424 /* Writeback is only supported for fixed-width modes. */
12425 size = GET_MODE_SIZE (mode).to_constant ();
12426 switch (GET_CODE (x))
12427 {
12428 case PRE_INC:
12429 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12430 return true;
12431 case POST_INC:
12432 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12433 return true;
12434 case PRE_DEC:
12435 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12436 return true;
12437 case POST_DEC:
12438 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12439 return true;
12440 case PRE_MODIFY:
12441 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12442 INTVAL (addr.offset));
12443 return true;
12444 case POST_MODIFY:
12445 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12446 INTVAL (addr.offset));
12447 return true;
12448 default:
12449 break;
12450 }
12451 break;
12452
12453 case ADDRESS_LO_SUM:
12454 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12455 output_addr_const (f, addr.offset);
12456 asm_fprintf (f, "]");
12457 return true;
12458
12459 case ADDRESS_SYMBOLIC:
12460 output_addr_const (f, x);
12461 return true;
12462 }
12463
12464 return false;
12465 }
12466
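/* Illustrative outputs of aarch64_print_address_internal, assuming the
   address has already been accepted by aarch64_classify_address:
     base x1, zero offset              -> "[x1]"
     base x1, immediate offset 16      -> "[x1, 16]"
     base x0, index x1, shift 3        -> "[x0, x1, lsl 3]"
     POST_INC of a DImode access on x2 -> "[x2], 8"
     LO_SUM of x0 and a symbol foo     -> "[x0, #:lo12:foo]"  */
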
12467 /* Print address 'x' of a memory access with mode 'mode'. */
12468 static void
12469 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12470 {
12471 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12472 output_addr_const (f, x);
12473 }
12474
12475 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12476
12477 static bool
12478 aarch64_output_addr_const_extra (FILE *file, rtx x)
12479 {
12480 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12481 {
12482 output_addr_const (file, XVECEXP (x, 0, 0));
12483 return true;
12484 }
12485 return false;
12486 }
12487
12488 bool
12489 aarch64_label_mentioned_p (rtx x)
12490 {
12491 const char *fmt;
12492 int i;
12493
12494 if (LABEL_REF_P (x))
12495 return true;
12496
12497 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12498 referencing instruction, but they are constant offsets, not
12499 symbols. */
12500 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12501 return false;
12502
12503 fmt = GET_RTX_FORMAT (GET_CODE (x));
12504 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12505 {
12506 if (fmt[i] == 'E')
12507 {
12508 int j;
12509
12510 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12511 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12512 return 1;
12513 }
12514 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12515 return 1;
12516 }
12517
12518 return 0;
12519 }
12520
12521 /* Implement REGNO_REG_CLASS. */
12522
12523 enum reg_class
12524 aarch64_regno_regclass (unsigned regno)
12525 {
12526 if (STUB_REGNUM_P (regno))
12527 return STUB_REGS;
12528
12529 if (GP_REGNUM_P (regno))
12530 return GENERAL_REGS;
12531
12532 if (regno == SP_REGNUM)
12533 return STACK_REG;
12534
12535 if (regno == FRAME_POINTER_REGNUM
12536 || regno == ARG_POINTER_REGNUM)
12537 return POINTER_REGS;
12538
12539 if (FP_REGNUM_P (regno))
12540 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12541 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12542
12543 if (PR_REGNUM_P (regno))
12544 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12545
12546 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12547 return FFR_REGS;
12548
12549 return NO_REGS;
12550 }
12551
12552 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12553 If OFFSET is out of range, return an offset of an anchor point
12554 that is in range. Return 0 otherwise. */
12555
12556 static HOST_WIDE_INT
12557 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12558 machine_mode mode)
12559 {
12560 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12561 if (size > 16)
12562 return (offset + 0x400) & ~0x7f0;
12563
12564 /* For offsets that aren't a multiple of the access size, the limit is
12565 -256...255. */
12566 if (offset & (size - 1))
12567 {
12568 /* BLKmode typically uses LDP of X-registers. */
12569 if (mode == BLKmode)
12570 return (offset + 512) & ~0x3ff;
12571 return (offset + 0x100) & ~0x1ff;
12572 }
12573
12574 /* Small negative offsets are supported. */
12575 if (IN_RANGE (offset, -256, 0))
12576 return 0;
12577
12578 if (mode == TImode || mode == TFmode || mode == TDmode)
12579 return (offset + 0x100) & ~0x1ff;
12580
12581 /* Otherwise use the unsigned 12-bit offset scaled by the access size. */
12582 return offset & (~0xfff * size);
12583 }
12584
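/* Worked example: for a non-BLKmode 4-byte access at the misaligned offset
   0x206, the offset is not a multiple of the access size, so the anchor is
   (0x206 + 0x100) & ~0x1ff == 0x200 and the residual offset 6 fits the
   signed 9-bit unscaled range [-256, 255].  For an aligned 4-byte access at
   offset 0x13004, the anchor is 0x13004 & ~0x3fff == 0x10000 and the
   residual 0x3004 fits the scaled 12-bit LDR/STR range.  */
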
12585 static rtx
12586 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12587 {
12588 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12589 where mask is selected by alignment and size of the offset.
12590 We try to pick as large a range for the offset as possible to
12591 maximize the chance of a CSE. However, for aligned addresses
12592 we limit the range to 4k so that structures with different sized
12593 elements are likely to use the same base. We need to be careful
12594 not to split a CONST for some forms of address expression, otherwise
12595 it will generate sub-optimal code. */
12596
12597 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12598 {
12599 rtx base = XEXP (x, 0);
12600 rtx offset_rtx = XEXP (x, 1);
12601 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12602
12603 if (GET_CODE (base) == PLUS)
12604 {
12605 rtx op0 = XEXP (base, 0);
12606 rtx op1 = XEXP (base, 1);
12607
12608 /* Force any scaling into a temp for CSE. */
12609 op0 = force_reg (Pmode, op0);
12610 op1 = force_reg (Pmode, op1);
12611
12612 /* Let the pointer register be in op0. */
12613 if (REG_POINTER (op1))
12614 std::swap (op0, op1);
12615
12616 /* If the pointer is virtual or frame related, then we know that
12617 virtual register instantiation or register elimination is going
12618 to apply a second constant. We want the two constants folded
12619 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12620 if (virt_or_elim_regno_p (REGNO (op0)))
12621 {
12622 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12623 NULL_RTX, true, OPTAB_DIRECT);
12624 return gen_rtx_PLUS (Pmode, base, op1);
12625 }
12626
12627 /* Otherwise, in order to encourage CSE (and thence loop strength
12628 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12629 base = expand_binop (Pmode, add_optab, op0, op1,
12630 NULL_RTX, true, OPTAB_DIRECT);
12631 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12632 }
12633
12634 HOST_WIDE_INT size;
12635 if (GET_MODE_SIZE (mode).is_constant (&size))
12636 {
12637 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12638 mode);
12639 if (base_offset != 0)
12640 {
12641 base = plus_constant (Pmode, base, base_offset);
12642 base = force_operand (base, NULL_RTX);
12643 return plus_constant (Pmode, base, offset - base_offset);
12644 }
12645 }
12646 }
12647
12648 return x;
12649 }
12650
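/* For illustration, with the SImode address (plus (reg R) (const_int
   0x13004)) the anchor computed above is 0x10000, so the address is
   rewritten as (plus (R + 0x10000) (const_int 0x3004)), letting the forced
   (R + 0x10000) base be CSEd across neighbouring accesses whose offsets
   share the same anchor.  */
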
12651 static reg_class_t
12652 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12653 reg_class_t rclass,
12654 machine_mode mode,
12655 secondary_reload_info *sri)
12656 {
12657 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12658 LDR and STR. See the comment at the head of aarch64-sve.md for
12659 more details about the big-endian handling. */
12660 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12661 if (reg_class_subset_p (rclass, FP_REGS)
12662 && !((REG_P (x) && HARD_REGISTER_P (x))
12663 || aarch64_simd_valid_immediate (x, NULL))
12664 && mode != VNx16QImode
12665 && (vec_flags & VEC_SVE_DATA)
12666 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12667 {
12668 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12669 return NO_REGS;
12670 }
12671
12672 /* If we have to disable direct literal pool loads and stores because the
12673 function is too big, then we need a scratch register. */
12674 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12675 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12676 || targetm.vector_mode_supported_p (GET_MODE (x)))
12677 && !aarch64_pcrelative_literal_loads)
12678 {
12679 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12680 return NO_REGS;
12681 }
12682
12683 /* Without the TARGET_SIMD instructions we cannot move a Q register
12684 to a Q register directly. We need a scratch. */
12685 if (REG_P (x)
12686 && (mode == TFmode
12687 || mode == TImode
12688 || mode == TDmode
12689 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12690 && mode == GET_MODE (x)
12691 && !TARGET_SIMD
12692 && FP_REGNUM_P (REGNO (x))
12693 && reg_class_subset_p (rclass, FP_REGS))
12694 {
12695 sri->icode = code_for_aarch64_reload_mov (mode);
12696 return NO_REGS;
12697 }
12698
12699 /* A TFmode, TImode or TDmode memory access should be handled via FP_REGS
12700 because AArch64 has richer addressing modes for LDR/STR instructions
12701 than LDP/STP instructions. */
12702 if (TARGET_FLOAT && rclass == GENERAL_REGS
12703 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12704 return FP_REGS;
12705
12706 if (rclass == FP_REGS
12707 && (mode == TImode || mode == TFmode || mode == TDmode)
12708 && CONSTANT_P (x))
12709 return GENERAL_REGS;
12710
12711 return NO_REGS;
12712 }
12713
12714 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12715
12716 static bool
12717 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12718 reg_class_t class2)
12719 {
12720 if (!TARGET_SIMD
12721 && reg_classes_intersect_p (class1, FP_REGS)
12722 && reg_classes_intersect_p (class2, FP_REGS))
12723 {
12724 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12725 so we can't easily split a move involving tuples of 128-bit
12726 vectors. Force the copy through memory instead.
12727
12728 (Tuples of 64-bit vectors are fine.) */
12729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12730 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12731 return true;
12732 }
12733 return false;
12734 }
12735
12736 static bool
12737 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12738 {
12739 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12740
12741 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12742 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12743 if (frame_pointer_needed)
12744 return to == HARD_FRAME_POINTER_REGNUM;
12745 return true;
12746 }
12747
12748 poly_int64
12749 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12750 {
12751 if (to == HARD_FRAME_POINTER_REGNUM)
12752 {
12753 if (from == ARG_POINTER_REGNUM)
12754 return cfun->machine->frame.hard_fp_offset;
12755
12756 if (from == FRAME_POINTER_REGNUM)
12757 return cfun->machine->frame.hard_fp_offset
12758 - cfun->machine->frame.locals_offset;
12759 }
12760
12761 if (to == STACK_POINTER_REGNUM)
12762 {
12763 if (from == FRAME_POINTER_REGNUM)
12764 return cfun->machine->frame.frame_size
12765 - cfun->machine->frame.locals_offset;
12766 }
12767
12768 return cfun->machine->frame.frame_size;
12769 }
12770
12771
12772 /* Get return address without mangling. */
12773
12774 rtx
12775 aarch64_return_addr_rtx (void)
12776 {
12777 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12778 /* Note: aarch64_return_address_signing_enabled only
12779 works after cfun->machine->frame.laid_out is set,
12780 so here we don't know if the return address will
12781 be signed or not. */
12782 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12783 emit_move_insn (lr, val);
12784 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12785 return lr;
12786 }
12787
12788
12789 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12790 previous frame. */
12791
12792 rtx
12793 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12794 {
12795 if (count != 0)
12796 return const0_rtx;
12797 return aarch64_return_addr_rtx ();
12798 }
12799
12800 static void
12801 aarch64_asm_trampoline_template (FILE *f)
12802 {
12803 /* Even if the current function doesn't have branch protection, some
12804 later function might, and since this template is only generated once
12805 we have to add a BTI just in case. */
12806 asm_fprintf (f, "\thint\t34 // bti c\n");
12807
12808 if (TARGET_ILP32)
12809 {
12810 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12811 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12812 }
12813 else
12814 {
12815 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12816 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12817 }
12818 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12819
12820 /* We always emit a speculation barrier.
12821 This is because the same trampoline template is used for every nested
12822 function. Since nested functions are not particularly common or
12823 performance-critical we don't worry too much about the extra instructions to copy
12824 around.
12825 This is not yet a problem, since we have not yet implemented function
12826 specific attributes to choose between hardening against straight line
12827 speculation or not, but such function specific attributes are likely to
12828 happen in the future. */
12829 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12830
12831 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12832 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12833 }
12834
12835 static void
12836 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12837 {
12838 rtx fnaddr, mem, a_tramp;
12839 const int tramp_code_sz = 24;
12840
12841 /* We don't need to copy the trailing D-words; we fill those in below. */
12842 /* We create our own memory address in Pmode so that `emit_block_move` can
12843 use parts of the backend which expect Pmode addresses. */
12844 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12845 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12846 assemble_trampoline_template (),
12847 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12848 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12849 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12850 if (GET_MODE (fnaddr) != ptr_mode)
12851 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12852 emit_move_insn (mem, fnaddr);
12853
12854 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12855 emit_move_insn (mem, chain_value);
12856
12857 /* XXX We should really define a "clear_cache" pattern and use
12858 gen_clear_cache(). */
12859 a_tramp = XEXP (m_tramp, 0);
12860 maybe_emit_call_builtin___clear_cache (a_tramp,
12861 plus_constant (ptr_mode,
12862 a_tramp,
12863 TRAMPOLINE_SIZE));
12864 }
12865
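/* Illustrative layout of the initialized trampoline for LP64, assuming IP1
   is x17 and the static chain register is x18 (as the reg_names lookups
   above suggest):
     bytes  0-23   code: bti c; ldr x17, .+20; ldr x18, .+24;
                         br x17; dsb sy; isb
     bytes 24-31   address of the nested function (loaded into x17)
     bytes 32-39   static chain value (loaded into x18)  */
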
12866 static unsigned char
12867 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12868 {
12869 /* ??? Logically we should only need to provide a value when
12870 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12871 can hold MODE, but at the moment we need to handle all modes.
12872 Just ignore any runtime parts for registers that can't store them. */
12873 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12874 unsigned int nregs, vec_flags;
12875 switch (regclass)
12876 {
12877 case STUB_REGS:
12878 case TAILCALL_ADDR_REGS:
12879 case POINTER_REGS:
12880 case GENERAL_REGS:
12881 case ALL_REGS:
12882 case POINTER_AND_FP_REGS:
12883 case FP_REGS:
12884 case FP_LO_REGS:
12885 case FP_LO8_REGS:
12886 vec_flags = aarch64_classify_vector_mode (mode);
12887 if ((vec_flags & VEC_SVE_DATA)
12888 && constant_multiple_p (GET_MODE_SIZE (mode),
12889 aarch64_vl_bytes (mode, vec_flags), &nregs))
12890 return nregs;
12891 return (vec_flags & VEC_ADVSIMD
12892 ? CEIL (lowest_size, UNITS_PER_VREG)
12893 : CEIL (lowest_size, UNITS_PER_WORD));
12894 case STACK_REG:
12895 case PR_REGS:
12896 case PR_LO_REGS:
12897 case PR_HI_REGS:
12898 case FFR_REGS:
12899 case PR_AND_FFR_REGS:
12900 return 1;
12901
12902 case NO_REGS:
12903 return 0;
12904
12905 default:
12906 break;
12907 }
12908 gcc_unreachable ();
12909 }
12910
12911 static reg_class_t
12912 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12913 {
12914 if (regclass == POINTER_REGS)
12915 return GENERAL_REGS;
12916
12917 if (regclass == STACK_REG)
12918 {
12919 if (REG_P (x)
12920 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12921 return regclass;
12922
12923 return NO_REGS;
12924 }
12925
12926 /* Register elimination can result in a request for
12927 SP+constant->FP_REGS. We cannot support such operations, which
12928 use SP as the source and an FP_REG as the destination, so reject
12929 them outright here. */
12930 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12931 {
12932 rtx lhs = XEXP (x, 0);
12933
12934 /* Look through a possible SUBREG introduced by ILP32. */
12935 if (SUBREG_P (lhs))
12936 lhs = SUBREG_REG (lhs);
12937
12938 gcc_assert (REG_P (lhs));
12939 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12940 POINTER_REGS));
12941 return NO_REGS;
12942 }
12943
12944 return regclass;
12945 }
12946
12947 void
12948 aarch64_asm_output_labelref (FILE* f, const char *name)
12949 {
12950 asm_fprintf (f, "%U%s", name);
12951 }
12952
12953 static void
12954 aarch64_elf_asm_constructor (rtx symbol, int priority)
12955 {
12956 if (priority == DEFAULT_INIT_PRIORITY)
12957 default_ctor_section_asm_out_constructor (symbol, priority);
12958 else
12959 {
12960 section *s;
12961 /* While priority is known to be in the range [0, 65535], and so 18
12962 bytes would be enough, the compiler might not know that. To avoid
12963 a -Wformat-truncation false positive, use a larger size. */
12964 char buf[23];
12965 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12966 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12967 switch_to_section (s);
12968 assemble_align (POINTER_SIZE);
12969 assemble_aligned_integer (POINTER_BYTES, symbol);
12970 }
12971 }
12972
12973 static void
12974 aarch64_elf_asm_destructor (rtx symbol, int priority)
12975 {
12976 if (priority == DEFAULT_INIT_PRIORITY)
12977 default_dtor_section_asm_out_destructor (symbol, priority);
12978 else
12979 {
12980 section *s;
12981 /* While priority is known to be in the range [0, 65535], and so 18
12982 bytes would be enough, the compiler might not know that. To avoid
12983 a -Wformat-truncation false positive, use a larger size. */
12984 char buf[23];
12985 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12986 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12987 switch_to_section (s);
12988 assemble_align (POINTER_SIZE);
12989 assemble_aligned_integer (POINTER_BYTES, symbol);
12990 }
12991 }
12992
12993 const char*
12994 aarch64_output_casesi (rtx *operands)
12995 {
12996 char buf[100];
12997 char label[100];
12998 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12999 int index;
13000 static const char *const patterns[4][2] =
13001 {
13002 {
13003 "ldrb\t%w3, [%0,%w1,uxtw]",
13004 "add\t%3, %4, %w3, sxtb #2"
13005 },
13006 {
13007 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13008 "add\t%3, %4, %w3, sxth #2"
13009 },
13010 {
13011 "ldr\t%w3, [%0,%w1,uxtw #2]",
13012 "add\t%3, %4, %w3, sxtw #2"
13013 },
13014 /* We assume that DImode is only generated when not optimizing and
13015 that we don't really need 64-bit address offsets. That would
13016 imply an object file with 8GB of code in a single function! */
13017 {
13018 "ldr\t%w3, [%0,%w1,uxtw #2]",
13019 "add\t%3, %4, %w3, sxtw #2"
13020 }
13021 };
13022
13023 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13024
13025 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13026 index = exact_log2 (GET_MODE_SIZE (mode));
13027
13028 gcc_assert (index >= 0 && index <= 3);
13029
13030 /* Need to implement table size reduction, by changing the code below. */
13031 output_asm_insn (patterns[index][0], operands);
13032 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13033 snprintf (buf, sizeof (buf),
13034 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13035 output_asm_insn (buf, operands);
13036 output_asm_insn (patterns[index][1], operands);
13037 output_asm_insn ("br\t%3", operands);
13038 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13039 operands);
13040 assemble_label (asm_out_file, label);
13041 return "";
13042 }
13043
13044
13045 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13046 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13047 operator. */
13048
13049 int
13050 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13051 {
13052 if (shift >= 0 && shift <= 3)
13053 {
13054 int size;
13055 for (size = 8; size <= 32; size *= 2)
13056 {
13057 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13058 if (mask == bits << shift)
13059 return size;
13060 }
13061 }
13062 return 0;
13063 }
13064
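/* Worked examples: aarch64_uxt_size (0, 0xffff) == 16, i.e. the mask
   corresponds to a UXTH with no shift, and aarch64_uxt_size (2, 0x3fc) == 8,
   since 0xff << 2 == 0x3fc, i.e. a UXTB combined with LSL #2 in an
   extended-register ADD/SUB.  A mask such as 0x1ff matches none of the
   widths for any shift, so the function returns 0.  */
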
13065 /* Constant pools are per function only when PC relative
13066 literal loads are true or we are in the large memory
13067 model. */
13068
13069 static inline bool
13070 aarch64_can_use_per_function_literal_pools_p (void)
13071 {
13072 return (aarch64_pcrelative_literal_loads
13073 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13074 }
13075
13076 static bool
13077 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13078 {
13079 /* We can't use blocks for constants when we're using a per-function
13080 constant pool. */
13081 return !aarch64_can_use_per_function_literal_pools_p ();
13082 }
13083
13084 /* Select appropriate section for constants depending
13085 on where we place literal pools. */
13086
13087 static section *
13088 aarch64_select_rtx_section (machine_mode mode,
13089 rtx x,
13090 unsigned HOST_WIDE_INT align)
13091 {
13092 if (aarch64_can_use_per_function_literal_pools_p ())
13093 return function_section (current_function_decl);
13094
13095 return default_elf_select_rtx_section (mode, x, align);
13096 }
13097
13098 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13099 void
13100 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13101 HOST_WIDE_INT offset)
13102 {
13103 /* When using per-function literal pools, we must ensure that any code
13104 section is aligned to the minimal instruction length, lest we get
13105 errors from the assembler re "unaligned instructions". */
13106 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13107 ASM_OUTPUT_ALIGN (f, 2);
13108 }
13109
13110 /* Costs. */
13111
13112 /* Helper function for rtx cost calculation. Strip a shift expression
13113 from X. Returns the inner operand if successful, or the original
13114 expression on failure. */
13115 static rtx
13116 aarch64_strip_shift (rtx x)
13117 {
13118 rtx op = x;
13119
13120 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13121 we can convert both to ROR during final output. */
13122 if ((GET_CODE (op) == ASHIFT
13123 || GET_CODE (op) == ASHIFTRT
13124 || GET_CODE (op) == LSHIFTRT
13125 || GET_CODE (op) == ROTATERT
13126 || GET_CODE (op) == ROTATE)
13127 && CONST_INT_P (XEXP (op, 1)))
13128 return XEXP (op, 0);
13129
13130 if (GET_CODE (op) == MULT
13131 && CONST_INT_P (XEXP (op, 1))
13132 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13133 return XEXP (op, 0);
13134
13135 return x;
13136 }
13137
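/* For example, aarch64_strip_shift reduces (ashift (reg X) (const_int 3))
   and (mult (reg X) (const_int 8)) to (reg X), since both forms can be
   folded into the shifted-register variant of the surrounding operation,
   whereas a shift by a register amount is returned unchanged.  */
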
13138 /* Helper function for rtx cost calculation. Strip an extend
13139 expression from X. Returns the inner operand if successful, or the
13140 original expression on failure. We deal with a number of possible
13141 canonicalization variations here. If STRIP_SHIFT is true, then
13142 we can strip off a shift also. */
13143 static rtx
13144 aarch64_strip_extend (rtx x, bool strip_shift)
13145 {
13146 scalar_int_mode mode;
13147 rtx op = x;
13148
13149 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13150 return op;
13151
13152 if (GET_CODE (op) == AND
13153 && GET_CODE (XEXP (op, 0)) == MULT
13154 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13155 && CONST_INT_P (XEXP (op, 1))
13156 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13157 INTVAL (XEXP (op, 1))) != 0)
13158 return XEXP (XEXP (op, 0), 0);
13159
13160 /* Now handle extended register, as this may also have an optional
13161 left shift by 1..4. */
13162 if (strip_shift
13163 && GET_CODE (op) == ASHIFT
13164 && CONST_INT_P (XEXP (op, 1))
13165 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13166 op = XEXP (op, 0);
13167
13168 if (GET_CODE (op) == ZERO_EXTEND
13169 || GET_CODE (op) == SIGN_EXTEND)
13170 op = XEXP (op, 0);
13171
13172 if (op != x)
13173 return op;
13174
13175 return x;
13176 }
13177
13178 /* Helper function for rtx cost calculation. Strip extension as well as any
13179 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13180 successful, or the original expression on failure. */
13181 static rtx
13182 aarch64_strip_extend_vec_half (rtx x)
13183 {
13184 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13185 {
13186 x = XEXP (x, 0);
13187 if (GET_CODE (x) == VEC_SELECT
13188 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13189 XEXP (x, 1)))
13190 x = XEXP (x, 0);
13191 }
13192 return x;
13193 }
13194
13195 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13196 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13197 operand if successful, or the original expression on failure. */
13198 static rtx
13199 aarch64_strip_duplicate_vec_elt (rtx x)
13200 {
13201 if (GET_CODE (x) == VEC_DUPLICATE
13202 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13203 {
13204 x = XEXP (x, 0);
13205 if (GET_CODE (x) == VEC_SELECT)
13206 x = XEXP (x, 0);
13207 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13208 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13209 x = XEXP (XEXP (x, 0), 0);
13210 }
13211 return x;
13212 }
13213
13214 /* Return true iff CODE is a shift supported in combination
13215 with arithmetic instructions. */
13216
13217 static bool
13218 aarch64_shift_p (enum rtx_code code)
13219 {
13220 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13221 }
13222
13223
13224 /* Return true iff X is a cheap shift without a sign extend. */
13225
13226 static bool
13227 aarch64_cheap_mult_shift_p (rtx x)
13228 {
13229 rtx op0, op1;
13230
13231 op0 = XEXP (x, 0);
13232 op1 = XEXP (x, 1);
13233
13234 if (!(aarch64_tune_params.extra_tuning_flags
13235 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13236 return false;
13237
13238 if (GET_CODE (op0) == SIGN_EXTEND)
13239 return false;
13240
13241 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13242 && UINTVAL (op1) <= 4)
13243 return true;
13244
13245 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13246 return false;
13247
13248 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13249
13250 if (l2 > 0 && l2 <= 4)
13251 return true;
13252
13253 return false;
13254 }
13255
13256 /* Helper function for rtx cost calculation. Calculate the cost of
13257 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13258 Return the calculated cost of the expression, recursing manually in to
13259 operands where needed. */
13260
13261 static int
13262 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13263 {
13264 rtx op0, op1;
13265 const struct cpu_cost_table *extra_cost
13266 = aarch64_tune_params.insn_extra_cost;
13267 int cost = 0;
13268 bool compound_p = (outer == PLUS || outer == MINUS);
13269 machine_mode mode = GET_MODE (x);
13270
13271 gcc_checking_assert (code == MULT);
13272
13273 op0 = XEXP (x, 0);
13274 op1 = XEXP (x, 1);
13275
13276 if (VECTOR_MODE_P (mode))
13277 {
13278 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13279 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13280 {
13281 /* The select-operand-high-half versions of the instruction have the
13282 same cost as the three vector version - don't add the costs of the
13283 extension or selection into the costs of the multiply. */
13284 op0 = aarch64_strip_extend_vec_half (op0);
13285 op1 = aarch64_strip_extend_vec_half (op1);
13286 /* The by-element versions of the instruction have the same costs as
13287 the normal 3-vector version. We make an assumption that the input
13288 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13289 costing of a MUL by element pre RA is a bit optimistic. */
13290 op0 = aarch64_strip_duplicate_vec_elt (op0);
13291 op1 = aarch64_strip_duplicate_vec_elt (op1);
13292 }
13293 cost += rtx_cost (op0, mode, MULT, 0, speed);
13294 cost += rtx_cost (op1, mode, MULT, 1, speed);
13295 if (speed)
13296 {
13297 if (GET_CODE (x) == MULT)
13298 cost += extra_cost->vect.mult;
13299 /* This is to catch the SSRA costing currently flowing here. */
13300 else
13301 cost += extra_cost->vect.alu;
13302 }
13303 return cost;
13304 }
13305
13306 /* Integer multiply/fma. */
13307 if (GET_MODE_CLASS (mode) == MODE_INT)
13308 {
13309 /* The multiply will be canonicalized as a shift, cost it as such. */
13310 if (aarch64_shift_p (GET_CODE (x))
13311 || (CONST_INT_P (op1)
13312 && exact_log2 (INTVAL (op1)) > 0))
13313 {
13314 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13315 || GET_CODE (op0) == SIGN_EXTEND;
13316 if (speed)
13317 {
13318 if (compound_p)
13319 {
13320 /* If the shift is considered cheap,
13321 then don't add any cost. */
13322 if (aarch64_cheap_mult_shift_p (x))
13323 ;
13324 else if (REG_P (op1))
13325 /* ARITH + shift-by-register. */
13326 cost += extra_cost->alu.arith_shift_reg;
13327 else if (is_extend)
13328 /* ARITH + extended register. We don't have a cost field
13329 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13330 cost += extra_cost->alu.extend_arith;
13331 else
13332 /* ARITH + shift-by-immediate. */
13333 cost += extra_cost->alu.arith_shift;
13334 }
13335 else
13336 /* LSL (immediate). */
13337 cost += extra_cost->alu.shift;
13338
13339 }
13340 /* Strip extends as we will have costed them in the case above. */
13341 if (is_extend)
13342 op0 = aarch64_strip_extend (op0, true);
13343
13344 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13345
13346 return cost;
13347 }
13348
13349 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13350 compound and let the below cases handle it. After all, MNEG is a
13351 special-case alias of MSUB. */
13352 if (GET_CODE (op0) == NEG)
13353 {
13354 op0 = XEXP (op0, 0);
13355 compound_p = true;
13356 }
13357
13358 /* Integer multiplies or FMAs have zero/sign extending variants. */
13359 if ((GET_CODE (op0) == ZERO_EXTEND
13360 && GET_CODE (op1) == ZERO_EXTEND)
13361 || (GET_CODE (op0) == SIGN_EXTEND
13362 && GET_CODE (op1) == SIGN_EXTEND))
13363 {
13364 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13365 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13366
13367 if (speed)
13368 {
13369 if (compound_p)
13370 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13371 cost += extra_cost->mult[0].extend_add;
13372 else
13373 /* MUL/SMULL/UMULL. */
13374 cost += extra_cost->mult[0].extend;
13375 }
13376
13377 return cost;
13378 }
13379
13380 /* This is either an integer multiply or a MADD. In both cases
13381 we want to recurse and cost the operands. */
13382 cost += rtx_cost (op0, mode, MULT, 0, speed);
13383 cost += rtx_cost (op1, mode, MULT, 1, speed);
13384
13385 if (speed)
13386 {
13387 if (compound_p)
13388 /* MADD/MSUB. */
13389 cost += extra_cost->mult[mode == DImode].add;
13390 else
13391 /* MUL. */
13392 cost += extra_cost->mult[mode == DImode].simple;
13393 }
13394
13395 return cost;
13396 }
13397 else
13398 {
13399 if (speed)
13400 {
13401 /* Floating-point FMA/FMUL can also support negations of the
13402 operands, unless the rounding mode is upward or downward in
13403 which case FNMUL is different from FMUL with operand negation. */
13404 bool neg0 = GET_CODE (op0) == NEG;
13405 bool neg1 = GET_CODE (op1) == NEG;
13406 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13407 {
13408 if (neg0)
13409 op0 = XEXP (op0, 0);
13410 if (neg1)
13411 op1 = XEXP (op1, 0);
13412 }
13413
13414 if (compound_p)
13415 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13416 cost += extra_cost->fp[mode == DFmode].fma;
13417 else
13418 /* FMUL/FNMUL. */
13419 cost += extra_cost->fp[mode == DFmode].mult;
13420 }
13421
13422 cost += rtx_cost (op0, mode, MULT, 0, speed);
13423 cost += rtx_cost (op1, mode, MULT, 1, speed);
13424 return cost;
13425 }
13426 }
13427
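/* For illustration: a DImode (mult (reg) (const_int 8)) that is not part of
   a PLUS/MINUS takes the shift path above (exact_log2 (8) == 3), so when
   optimizing for speed it is costed as the register operand plus
   extra_cost->alu.shift, i.e. as an LSL rather than a MUL.  When the outer
   code is PLUS or MINUS, the same rtx is instead costed under the
   arith_shift (or cheap-shift) rules.  */
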
13428 static int
13429 aarch64_address_cost (rtx x,
13430 machine_mode mode,
13431 addr_space_t as ATTRIBUTE_UNUSED,
13432 bool speed)
13433 {
13434 enum rtx_code c = GET_CODE (x);
13435 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13436 struct aarch64_address_info info;
13437 int cost = 0;
13438 info.shift = 0;
13439
13440 if (!aarch64_classify_address (&info, x, mode, false))
13441 {
13442 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13443 {
13444 /* This is a CONST or SYMBOL ref which will be split
13445 in a different way depending on the code model in use.
13446 Cost it through the generic infrastructure. */
13447 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13448 /* Divide through by the cost of one instruction to
13449 bring it to the same units as the address costs. */
13450 cost_symbol_ref /= COSTS_N_INSNS (1);
13451 /* The cost is then the cost of preparing the address,
13452 followed by an immediate (possibly 0) offset. */
13453 return cost_symbol_ref + addr_cost->imm_offset;
13454 }
13455 else
13456 {
13457 /* This is most likely a jump table from a case
13458 statement. */
13459 return addr_cost->register_offset;
13460 }
13461 }
13462
13463 switch (info.type)
13464 {
13465 case ADDRESS_LO_SUM:
13466 case ADDRESS_SYMBOLIC:
13467 case ADDRESS_REG_IMM:
13468 cost += addr_cost->imm_offset;
13469 break;
13470
13471 case ADDRESS_REG_WB:
13472 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13473 cost += addr_cost->pre_modify;
13474 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13475 {
13476 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13477 if (nvectors == 3)
13478 cost += addr_cost->post_modify_ld3_st3;
13479 else if (nvectors == 4)
13480 cost += addr_cost->post_modify_ld4_st4;
13481 else
13482 cost += addr_cost->post_modify;
13483 }
13484 else
13485 gcc_unreachable ();
13486
13487 break;
13488
13489 case ADDRESS_REG_REG:
13490 cost += addr_cost->register_offset;
13491 break;
13492
13493 case ADDRESS_REG_SXTW:
13494 cost += addr_cost->register_sextend;
13495 break;
13496
13497 case ADDRESS_REG_UXTW:
13498 cost += addr_cost->register_zextend;
13499 break;
13500
13501 default:
13502 gcc_unreachable ();
13503 }
13504
13505
13506 if (info.shift > 0)
13507 {
13508 /* For the sake of calculating the cost of the shifted register
13509 component, we can treat same sized modes in the same way. */
13510 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13511 cost += addr_cost->addr_scale_costs.hi;
13512 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13513 cost += addr_cost->addr_scale_costs.si;
13514 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13515 cost += addr_cost->addr_scale_costs.di;
13516 else
13517 /* We can't tell, or this is a 128-bit vector. */
13518 cost += addr_cost->addr_scale_costs.ti;
13519 }
13520
13521 return cost;
13522 }
13523
13524 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13525 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13526 to be taken. */
13527
13528 int
13529 aarch64_branch_cost (bool speed_p, bool predictable_p)
13530 {
13531 /* When optimizing for speed, use the cost of unpredictable branches. */
13532 const struct cpu_branch_cost *branch_costs =
13533 aarch64_tune_params.branch_costs;
13534
13535 if (!speed_p || predictable_p)
13536 return branch_costs->predictable;
13537 else
13538 return branch_costs->unpredictable;
13539 }
13540
13541 /* Return true if X is a zero or sign extract
13542 usable in an ADD or SUB (extended register) instruction. */
13543 static bool
13544 aarch64_rtx_arith_op_extract_p (rtx x)
13545 {
13546 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13547 No shift. */
13548 if (GET_CODE (x) == SIGN_EXTEND
13549 || GET_CODE (x) == ZERO_EXTEND)
13550 return REG_P (XEXP (x, 0));
13551
13552 return false;
13553 }
13554
13555 static bool
13556 aarch64_frint_unspec_p (unsigned int u)
13557 {
13558 switch (u)
13559 {
13560 case UNSPEC_FRINTZ:
13561 case UNSPEC_FRINTP:
13562 case UNSPEC_FRINTM:
13563 case UNSPEC_FRINTA:
13564 case UNSPEC_FRINTN:
13565 case UNSPEC_FRINTX:
13566 case UNSPEC_FRINTI:
13567 return true;
13568
13569 default:
13570 return false;
13571 }
13572 }
13573
13574 /* Return true iff X is an rtx that will match an extr instruction
13575 i.e. as described in the *extr<mode>5_insn family of patterns.
13576 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
13577 on success and will be NULL_RTX otherwise. */
13578
13579 static bool
13580 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13581 {
13582 rtx op0, op1;
13583 scalar_int_mode mode;
13584 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13585 return false;
13586
13587 *res_op0 = NULL_RTX;
13588 *res_op1 = NULL_RTX;
13589
13590 if (GET_CODE (x) != IOR)
13591 return false;
13592
13593 op0 = XEXP (x, 0);
13594 op1 = XEXP (x, 1);
13595
13596 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13597 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13598 {
13599 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13600 if (GET_CODE (op1) == ASHIFT)
13601 std::swap (op0, op1);
13602
13603 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13604 return false;
13605
13606 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13607 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13608
13609 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13610 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13611 {
13612 *res_op0 = XEXP (op0, 0);
13613 *res_op1 = XEXP (op1, 0);
13614 return true;
13615 }
13616 }
13617
13618 return false;
13619 }
13620
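/* For example, the DImode rtx
     (ior (ashift (reg A) (const_int 16)) (lshiftrt (reg B) (const_int 48)))
   satisfies 16 + 48 == 64, so *RES_OP0 is set to A, *RES_OP1 to B and true
   is returned; such an rtx would typically be emitted as
   "extr xd, xa, xb, 48" by the DImode instance of the *extr<mode>5_insn
   family mentioned above.  */
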
13621 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13622 storing it in *COST. Result is true if the total cost of the operation
13623 has now been calculated. */
13624 static bool
13625 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13626 {
13627 rtx inner;
13628 rtx comparator;
13629 enum rtx_code cmpcode;
13630 const struct cpu_cost_table *extra_cost
13631 = aarch64_tune_params.insn_extra_cost;
13632
13633 if (COMPARISON_P (op0))
13634 {
13635 inner = XEXP (op0, 0);
13636 comparator = XEXP (op0, 1);
13637 cmpcode = GET_CODE (op0);
13638 }
13639 else
13640 {
13641 inner = op0;
13642 comparator = const0_rtx;
13643 cmpcode = NE;
13644 }
13645
13646 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13647 {
13648 /* Conditional branch. */
13649 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13650 return true;
13651 else
13652 {
13653 if (cmpcode == NE || cmpcode == EQ)
13654 {
13655 if (comparator == const0_rtx)
13656 {
13657 /* TBZ/TBNZ/CBZ/CBNZ. */
13658 if (GET_CODE (inner) == ZERO_EXTRACT)
13659 /* TBZ/TBNZ. */
13660 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13661 ZERO_EXTRACT, 0, speed);
13662 else
13663 /* CBZ/CBNZ. */
13664 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13665
13666 return true;
13667 }
13668 if (register_operand (inner, VOIDmode)
13669 && aarch64_imm24 (comparator, VOIDmode))
13670 {
13671 /* SUB and SUBS. */
13672 *cost += COSTS_N_INSNS (2);
13673 if (speed)
13674 *cost += extra_cost->alu.arith * 2;
13675 return true;
13676 }
13677 }
13678 else if (cmpcode == LT || cmpcode == GE)
13679 {
13680 /* TBZ/TBNZ. */
13681 if (comparator == const0_rtx)
13682 return true;
13683 }
13684 }
13685 }
13686 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13687 {
13688 /* CCMP. */
13689 if (GET_CODE (op1) == COMPARE)
13690 {
13691 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13692 if (XEXP (op1, 1) == const0_rtx)
13693 *cost += 1;
13694 if (speed)
13695 {
13696 machine_mode mode = GET_MODE (XEXP (op1, 0));
13697
13698 if (GET_MODE_CLASS (mode) == MODE_INT)
13699 *cost += extra_cost->alu.arith;
13700 else
13701 *cost += extra_cost->fp[mode == DFmode].compare;
13702 }
13703 return true;
13704 }
13705
13706 /* It's a conditional operation based on the status flags,
13707 so it must be some flavor of CSEL. */
13708
13709 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
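      /* For instance (illustrative only), an op1 of the form (neg (reg x)) in
	 (if_then_else (lt ...) (neg (reg x)) (reg y)) maps onto a single
	 CSNEG, so the NEG is stripped below rather than costed separately.  */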
13710 if (GET_CODE (op1) == NEG
13711 || GET_CODE (op1) == NOT
13712 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13713 op1 = XEXP (op1, 0);
13714 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13715 {
13716 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13717 op1 = XEXP (op1, 0);
13718 op2 = XEXP (op2, 0);
13719 }
13720 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13721 {
13722 inner = XEXP (op1, 0);
13723 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13724 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13725 op1 = XEXP (inner, 0);
13726 }
13727
13728 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13729 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13730 return true;
13731 }
13732
13733   /* We don't know what this is; cost all operands.  */
13734 return false;
13735 }
13736
13737 /* Check whether X is a bitfield operation of the form shift + extend that
13738 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13739 operand to which the bitfield operation is applied. Otherwise return
13740 NULL_RTX. */
13741
13742 static rtx
13743 aarch64_extend_bitfield_pattern_p (rtx x)
13744 {
13745 rtx_code outer_code = GET_CODE (x);
13746 machine_mode outer_mode = GET_MODE (x);
13747
13748 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13749 && outer_mode != SImode && outer_mode != DImode)
13750 return NULL_RTX;
13751
13752 rtx inner = XEXP (x, 0);
13753 rtx_code inner_code = GET_CODE (inner);
13754 machine_mode inner_mode = GET_MODE (inner);
13755 rtx op = NULL_RTX;
13756
13757 switch (inner_code)
13758 {
13759 case ASHIFT:
13760 if (CONST_INT_P (XEXP (inner, 1))
13761 && (inner_mode == QImode || inner_mode == HImode))
13762 op = XEXP (inner, 0);
13763 break;
13764 case LSHIFTRT:
13765 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13766 && (inner_mode == QImode || inner_mode == HImode))
13767 op = XEXP (inner, 0);
13768 break;
13769 case ASHIFTRT:
13770 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13771 && (inner_mode == QImode || inner_mode == HImode))
13772 op = XEXP (inner, 0);
13773 break;
13774 default:
13775 break;
13776 }
13777
13778 return op;
13779 }
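/* Illustrative examples of the shapes accepted above (register choices are
   arbitrary):
     (zero_extend:SI (lshiftrt:HI (reg:HI r) (const_int 3)))  -> UBFX
     (sign_extend:DI (ashiftrt:HI (reg:HI r) (const_int 3)))  -> SBFX
     (zero_extend:SI (ashift:QI (reg:QI r) (const_int 2)))    -> UBFIZ  */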
13780
13781 /* Return true if the mask and a shift amount from an RTX of the form
13782 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13783 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13784
13785 bool
13786 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13787 rtx shft_amnt)
13788 {
13789 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13790 && INTVAL (mask) > 0
13791 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13792 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13793 && (UINTVAL (mask)
13794 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13795 }
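/* A worked example (for illustration only): in SImode, with
   MASK == 0x00ffff00 and SHFT_AMNT == 8, all of the checks above pass:
   (0x00ffff00 >> 8) + 1 == 0x10000 is a power of two and the low 8 bits of
   the mask are clear, so (x << 8) & 0x00ffff00 can be implemented as
   UBFIZ w0, w0, #8, #16.  */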
13796
13797 /* Return true if the masks and a shift amount from an RTX of the form
13798 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13799    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
13800
13801 bool
13802 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13803 unsigned HOST_WIDE_INT mask1,
13804 unsigned HOST_WIDE_INT shft_amnt,
13805 unsigned HOST_WIDE_INT mask2)
13806 {
13807 unsigned HOST_WIDE_INT t;
13808
13809   /* Verify that the two masks are exact complements of one another.  */
13810 if (mask1 != ~mask2)
13811 return false;
13812
13813 /* Verify that mask2 is not all zeros or ones. */
13814 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13815 return false;
13816
13817 /* The shift amount should always be less than the mode size. */
13818 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13819
13820 /* Verify that the mask being shifted is contiguous and would be in the
13821 least significant bits after shifting by shft_amnt. */
13822 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13823 return (t == (t & -t));
13824 }
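/* A worked example (illustrative): in DImode, with SHFT_AMNT == 8,
   MASK2 == 0xff00 and MASK1 == ~0xff00, MASK2 + (1 << 8) == 0x10000 is a
   power of two, so (x & ~0xff00) | ((y << 8) & 0xff00) can be implemented
   as BFI x0, x1, #8, #8, inserting 8 bits of y at bit 8 of x.  */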
13825
13826 /* Calculate the cost of calculating X, storing it in *COST. Result
13827 is true if the total cost of the operation has now been calculated. */
13828 static bool
13829 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13830 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13831 {
13832 rtx op0, op1, op2;
13833 const struct cpu_cost_table *extra_cost
13834 = aarch64_tune_params.insn_extra_cost;
13835 rtx_code code = GET_CODE (x);
13836 scalar_int_mode int_mode;
13837
13838 /* By default, assume that everything has equivalent cost to the
13839 cheapest instruction. Any additional costs are applied as a delta
13840 above this default. */
13841 *cost = COSTS_N_INSNS (1);
13842
13843 switch (code)
13844 {
13845 case SET:
13846 /* The cost depends entirely on the operands to SET. */
13847 *cost = 0;
13848 op0 = SET_DEST (x);
13849 op1 = SET_SRC (x);
13850
13851 switch (GET_CODE (op0))
13852 {
13853 case MEM:
13854 if (speed)
13855 {
13856 rtx address = XEXP (op0, 0);
13857 if (VECTOR_MODE_P (mode))
13858 *cost += extra_cost->ldst.storev;
13859 else if (GET_MODE_CLASS (mode) == MODE_INT)
13860 *cost += extra_cost->ldst.store;
13861 else if (mode == SFmode || mode == SDmode)
13862 *cost += extra_cost->ldst.storef;
13863 else if (mode == DFmode || mode == DDmode)
13864 *cost += extra_cost->ldst.stored;
13865
13866 *cost +=
13867 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13868 0, speed));
13869 }
13870
13871 *cost += rtx_cost (op1, mode, SET, 1, speed);
13872 return true;
13873
13874 case SUBREG:
13875 if (! REG_P (SUBREG_REG (op0)))
13876 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13877
13878 /* Fall through. */
13879 case REG:
13880 /* The cost is one per vector-register copied. */
13881 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13882 {
13883 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13884 *cost = COSTS_N_INSNS (nregs);
13885 }
13886 /* const0_rtx is in general free, but we will use an
13887 instruction to set a register to 0. */
13888 else if (REG_P (op1) || op1 == const0_rtx)
13889 {
13890 /* The cost is 1 per register copied. */
13891 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13892 *cost = COSTS_N_INSNS (nregs);
13893 }
13894 else
13895 /* Cost is just the cost of the RHS of the set. */
13896 *cost += rtx_cost (op1, mode, SET, 1, speed);
13897 return true;
13898
13899 case ZERO_EXTRACT:
13900 case SIGN_EXTRACT:
13901 /* Bit-field insertion. Strip any redundant widening of
13902 the RHS to meet the width of the target. */
13903 if (SUBREG_P (op1))
13904 op1 = SUBREG_REG (op1);
13905 if ((GET_CODE (op1) == ZERO_EXTEND
13906 || GET_CODE (op1) == SIGN_EXTEND)
13907 && CONST_INT_P (XEXP (op0, 1))
13908 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13909 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13910 op1 = XEXP (op1, 0);
13911
13912 if (CONST_INT_P (op1))
13913 {
13914 /* MOV immediate is assumed to always be cheap. */
13915 *cost = COSTS_N_INSNS (1);
13916 }
13917 else
13918 {
13919 /* BFM. */
13920 if (speed)
13921 *cost += extra_cost->alu.bfi;
13922 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13923 }
13924
13925 return true;
13926
13927 default:
13928 /* We can't make sense of this, assume default cost. */
13929 *cost = COSTS_N_INSNS (1);
13930 return false;
13931 }
13932 return false;
13933
13934 case CONST_INT:
13935       /* If an instruction can incorporate a constant directly, its
13936          pattern avoids calling rtx_cost () on that constant.  So if
13937          rtx_cost () is called on a constant, it is usually because
13938          the constant must be moved into a register by one or more
13939          instructions.
13940
13941          The exception is constant 0, which can be expressed as
13942          XZR/WZR and is therefore free.  Even then, a plain
13943          (set (reg) (const0_rtx)) still needs a move instruction,
13944          but we catch that case when we cost the SET, so we don't
13945          need to consider it here.  */
13946 if (x == const0_rtx)
13947 *cost = 0;
13948 else
13949 {
13950           /* To a first approximation, the cost of building any other
13951              constant is proportional to the number of instructions
13952              required to build it.  This is true whether we are
13953              compiling for SPEED or otherwise.  */
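          /* For example (purely illustrative), a 64-bit constant that needs a
             MOVZ plus three MOVKs to materialise would be costed as
             COSTS_N_INSNS (4) here; the exact count comes from
             aarch64_internal_mov_immediate.  */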
13954 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
13955 ? SImode : DImode;
13956 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13957 (NULL_RTX, x, false, imode));
13958 }
13959 return true;
13960
13961 case CONST_DOUBLE:
13962
13963 /* First determine number of instructions to do the move
13964 as an integer constant. */
13965 if (!aarch64_float_const_representable_p (x)
13966 && !aarch64_can_const_movi_rtx_p (x, mode)
13967 && aarch64_float_const_rtx_p (x))
13968 {
13969 unsigned HOST_WIDE_INT ival;
13970 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13971 gcc_assert (succeed);
13972
13973 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
13974 ? DImode : SImode;
13975 int ncost = aarch64_internal_mov_immediate
13976 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13977 *cost += COSTS_N_INSNS (ncost);
13978 return true;
13979 }
13980
13981 if (speed)
13982 {
13983 /* mov[df,sf]_aarch64. */
13984 if (aarch64_float_const_representable_p (x))
13985 /* FMOV (scalar immediate). */
13986 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13987 else if (!aarch64_float_const_zero_rtx_p (x))
13988 {
13989 /* This will be a load from memory. */
13990 if (mode == DFmode || mode == DDmode)
13991 *cost += extra_cost->ldst.loadd;
13992 else
13993 *cost += extra_cost->ldst.loadf;
13994 }
13995 else
13996 /* Otherwise this is +0.0. We get this using MOVI d0, #0
13997           or MOV v0.s[0], wzr, neither of which is modeled by the
13998           cost tables.  Just use the default cost.  */
13999 {
14000 }
14001 }
14002
14003 return true;
14004
14005 case MEM:
14006 if (speed)
14007 {
14008 /* For loads we want the base cost of a load, plus an
14009 approximation for the additional cost of the addressing
14010 mode. */
14011 rtx address = XEXP (x, 0);
14012 if (VECTOR_MODE_P (mode))
14013 *cost += extra_cost->ldst.loadv;
14014 else if (GET_MODE_CLASS (mode) == MODE_INT)
14015 *cost += extra_cost->ldst.load;
14016 else if (mode == SFmode || mode == SDmode)
14017 *cost += extra_cost->ldst.loadf;
14018 else if (mode == DFmode || mode == DDmode)
14019 *cost += extra_cost->ldst.loadd;
14020
14021 *cost +=
14022 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14023 0, speed));
14024 }
14025
14026 return true;
14027
14028 case NEG:
14029 op0 = XEXP (x, 0);
14030
14031 if (VECTOR_MODE_P (mode))
14032 {
14033 if (speed)
14034 {
14035 /* FNEG. */
14036 *cost += extra_cost->vect.alu;
14037 }
14038 return false;
14039 }
14040
14041 if (GET_MODE_CLASS (mode) == MODE_INT)
14042 {
14043 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14044 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14045 {
14046 /* CSETM. */
14047 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14048 return true;
14049 }
14050
14051 /* Cost this as SUB wzr, X. */
14052 op0 = CONST0_RTX (mode);
14053 op1 = XEXP (x, 0);
14054 goto cost_minus;
14055 }
14056
14057 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14058 {
14059 /* Support (neg(fma...)) as a single instruction only if
14060 sign of zeros is unimportant. This matches the decision
14061 making in aarch64.md. */
14062 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14063 {
14064 /* FNMADD. */
14065 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14066 return true;
14067 }
14068 if (GET_CODE (op0) == MULT)
14069 {
14070 /* FNMUL. */
14071 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14072 return true;
14073 }
14074 if (speed)
14075 /* FNEG. */
14076 *cost += extra_cost->fp[mode == DFmode].neg;
14077 return false;
14078 }
14079
14080 return false;
14081
14082 case CLRSB:
14083 case CLZ:
14084 if (speed)
14085 {
14086 if (VECTOR_MODE_P (mode))
14087 *cost += extra_cost->vect.alu;
14088 else
14089 *cost += extra_cost->alu.clz;
14090 }
14091
14092 return false;
14093
14094 case CTZ:
14095 *cost = COSTS_N_INSNS (2);
14096
14097 if (speed)
14098 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14099 return false;
14100
14101 case COMPARE:
14102 op0 = XEXP (x, 0);
14103 op1 = XEXP (x, 1);
14104
14105 if (op1 == const0_rtx
14106 && GET_CODE (op0) == AND)
14107 {
14108 x = op0;
14109 mode = GET_MODE (op0);
14110 goto cost_logic;
14111 }
14112
14113 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14114 {
14115           /* TODO: A write to the CC flags possibly costs extra; this
14116              needs encoding in the cost tables.  */
14117
14118 mode = GET_MODE (op0);
14119 /* ANDS. */
14120 if (GET_CODE (op0) == AND)
14121 {
14122 x = op0;
14123 goto cost_logic;
14124 }
14125
14126 if (GET_CODE (op0) == PLUS)
14127 {
14128 /* ADDS (and CMN alias). */
14129 x = op0;
14130 goto cost_plus;
14131 }
14132
14133 if (GET_CODE (op0) == MINUS)
14134 {
14135 /* SUBS. */
14136 x = op0;
14137 goto cost_minus;
14138 }
14139
14140 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14141 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14142 && CONST_INT_P (XEXP (op0, 2)))
14143 {
14144 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14145 Handle it here directly rather than going to cost_logic
14146 since we know the immediate generated for the TST is valid
14147 so we can avoid creating an intermediate rtx for it only
14148 for costing purposes. */
14149 if (speed)
14150 *cost += extra_cost->alu.logical;
14151
14152 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14153 ZERO_EXTRACT, 0, speed);
14154 return true;
14155 }
14156
14157 if (GET_CODE (op1) == NEG)
14158 {
14159 /* CMN. */
14160 if (speed)
14161 *cost += extra_cost->alu.arith;
14162
14163 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14164 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14165 return true;
14166 }
14167
14168 /* CMP.
14169
14170 Compare can freely swap the order of operands, and
14171 canonicalization puts the more complex operation first.
14172 But the integer MINUS logic expects the shift/extend
14173 operation in op1. */
14174 if (! (REG_P (op0)
14175 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14176 {
14177 op0 = XEXP (x, 1);
14178 op1 = XEXP (x, 0);
14179 }
14180 goto cost_minus;
14181 }
14182
14183 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14184 {
14185 /* FCMP. */
14186 if (speed)
14187 *cost += extra_cost->fp[mode == DFmode].compare;
14188
14189 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14190 {
14191 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14192 /* FCMP supports constant 0.0 for no extra cost. */
14193 return true;
14194 }
14195 return false;
14196 }
14197
14198 if (VECTOR_MODE_P (mode))
14199 {
14200 /* Vector compare. */
14201 if (speed)
14202 *cost += extra_cost->vect.alu;
14203
14204 if (aarch64_float_const_zero_rtx_p (op1))
14205 {
14206 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14207 cost. */
14208 return true;
14209 }
14210 return false;
14211 }
14212 return false;
14213
14214 case MINUS:
14215 {
14216 op0 = XEXP (x, 0);
14217 op1 = XEXP (x, 1);
14218
14219 cost_minus:
14220 if (VECTOR_MODE_P (mode))
14221 {
14222 /* SUBL2 and SUBW2. */
14223 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14224 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14225 {
14226           /* The select-operand-high-half versions of the sub instruction
14227              have the same cost as the regular three-operand vector
14228              version, so don't add the cost of the select into the cost
14229              of the sub.  */
14230 op0 = aarch64_strip_extend_vec_half (op0);
14231 op1 = aarch64_strip_extend_vec_half (op1);
14232 }
14233 }
14234
14235 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14236
14237 /* Detect valid immediates. */
14238 if ((GET_MODE_CLASS (mode) == MODE_INT
14239 || (GET_MODE_CLASS (mode) == MODE_CC
14240 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14241 && CONST_INT_P (op1)
14242 && aarch64_uimm12_shift (INTVAL (op1)))
14243 {
14244 if (speed)
14245 /* SUB(S) (immediate). */
14246 *cost += extra_cost->alu.arith;
14247 return true;
14248 }
14249
14250 /* Look for SUB (extended register). */
14251 if (is_a <scalar_int_mode> (mode)
14252 && aarch64_rtx_arith_op_extract_p (op1))
14253 {
14254 if (speed)
14255 *cost += extra_cost->alu.extend_arith;
14256
14257 op1 = aarch64_strip_extend (op1, true);
14258 *cost += rtx_cost (op1, VOIDmode,
14259 (enum rtx_code) GET_CODE (op1), 0, speed);
14260 return true;
14261 }
14262
14263 rtx new_op1 = aarch64_strip_extend (op1, false);
14264
14265 /* Cost this as an FMA-alike operation. */
14266 if ((GET_CODE (new_op1) == MULT
14267 || aarch64_shift_p (GET_CODE (new_op1)))
14268 && code != COMPARE)
14269 {
14270 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14271 (enum rtx_code) code,
14272 speed);
14273 return true;
14274 }
14275
14276 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14277
14278 if (speed)
14279 {
14280 if (VECTOR_MODE_P (mode))
14281 {
14282 /* Vector SUB. */
14283 *cost += extra_cost->vect.alu;
14284 }
14285 else if (GET_MODE_CLASS (mode) == MODE_INT)
14286 {
14287 /* SUB(S). */
14288 *cost += extra_cost->alu.arith;
14289 }
14290 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14291 {
14292 /* FSUB. */
14293 *cost += extra_cost->fp[mode == DFmode].addsub;
14294 }
14295 }
14296 return true;
14297 }
14298
14299 case PLUS:
14300 {
14301 rtx new_op0;
14302
14303 op0 = XEXP (x, 0);
14304 op1 = XEXP (x, 1);
14305
14306 cost_plus:
14307 if (VECTOR_MODE_P (mode))
14308 {
14309 /* ADDL2 and ADDW2. */
14310 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14311 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14312 {
14313           /* The select-operand-high-half versions of the add instruction
14314              have the same cost as the regular three-operand vector
14315              version, so don't add the cost of the select into the cost
14316              of the add.  */
14317 op0 = aarch64_strip_extend_vec_half (op0);
14318 op1 = aarch64_strip_extend_vec_half (op1);
14319 }
14320 }
14321
14322 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14323 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14324 {
14325 /* CSINC. */
14326 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14327 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14328 return true;
14329 }
14330
14331 if (GET_MODE_CLASS (mode) == MODE_INT
14332 && (aarch64_plus_immediate (op1, mode)
14333 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14334 {
14335 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14336
14337 if (speed)
14338 {
14339 /* ADD (immediate). */
14340 *cost += extra_cost->alu.arith;
14341
14342 /* Some tunings prefer to not use the VL-based scalar ops.
14343 Increase the cost of the poly immediate to prevent their
14344 formation. */
14345 if (GET_CODE (op1) == CONST_POLY_INT
14346 && (aarch64_tune_params.extra_tuning_flags
14347 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14348 *cost += COSTS_N_INSNS (1);
14349 }
14350 return true;
14351 }
14352
14353 if (aarch64_pluslong_immediate (op1, mode))
14354 {
14355 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14356 if ((INTVAL (op1) & 0xfff) != 0)
14357 *cost += COSTS_N_INSNS (1);
14358
14359 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14360 return true;
14361 }
14362
14363 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14364
14365 /* Look for ADD (extended register). */
14366 if (is_a <scalar_int_mode> (mode)
14367 && aarch64_rtx_arith_op_extract_p (op0))
14368 {
14369 if (speed)
14370 *cost += extra_cost->alu.extend_arith;
14371
14372 op0 = aarch64_strip_extend (op0, true);
14373 *cost += rtx_cost (op0, VOIDmode,
14374 (enum rtx_code) GET_CODE (op0), 0, speed);
14375 return true;
14376 }
14377
14378 /* Strip any extend, leave shifts behind as we will
14379 cost them through mult_cost. */
14380 new_op0 = aarch64_strip_extend (op0, false);
14381
14382 if (GET_CODE (new_op0) == MULT
14383 || aarch64_shift_p (GET_CODE (new_op0)))
14384 {
14385 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14386 speed);
14387 return true;
14388 }
14389
14390 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14391
14392 if (speed)
14393 {
14394 if (VECTOR_MODE_P (mode))
14395 {
14396 /* Vector ADD. */
14397 *cost += extra_cost->vect.alu;
14398 }
14399 else if (GET_MODE_CLASS (mode) == MODE_INT)
14400 {
14401 /* ADD. */
14402 *cost += extra_cost->alu.arith;
14403 }
14404 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14405 {
14406 /* FADD. */
14407 *cost += extra_cost->fp[mode == DFmode].addsub;
14408 }
14409 }
14410 return true;
14411 }
14412
14413 case BSWAP:
14414 *cost = COSTS_N_INSNS (1);
14415
14416 if (speed)
14417 {
14418 if (VECTOR_MODE_P (mode))
14419 *cost += extra_cost->vect.alu;
14420 else
14421 *cost += extra_cost->alu.rev;
14422 }
14423 return false;
14424
14425 case IOR:
14426 if (aarch_rev16_p (x))
14427 {
14428 *cost = COSTS_N_INSNS (1);
14429
14430 if (speed)
14431 {
14432 if (VECTOR_MODE_P (mode))
14433 *cost += extra_cost->vect.alu;
14434 else
14435 *cost += extra_cost->alu.rev;
14436 }
14437 return true;
14438 }
14439
14440 if (aarch64_extr_rtx_p (x, &op0, &op1))
14441 {
14442 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14443 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14444 if (speed)
14445 *cost += extra_cost->alu.shift;
14446
14447 return true;
14448 }
14449 /* Fall through. */
14450 case XOR:
14451 case AND:
14452 cost_logic:
14453 op0 = XEXP (x, 0);
14454 op1 = XEXP (x, 1);
14455
14456 if (VECTOR_MODE_P (mode))
14457 {
14458 if (speed)
14459 *cost += extra_cost->vect.alu;
14460 return true;
14461 }
14462
14463 if (code == AND
14464 && GET_CODE (op0) == MULT
14465 && CONST_INT_P (XEXP (op0, 1))
14466 && CONST_INT_P (op1)
14467 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14468 INTVAL (op1)) != 0)
14469 {
14470 /* This is a UBFM/SBFM. */
14471 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14472 if (speed)
14473 *cost += extra_cost->alu.bfx;
14474 return true;
14475 }
14476
14477 if (is_int_mode (mode, &int_mode))
14478 {
14479 if (CONST_INT_P (op1))
14480 {
14481 /* We have a mask + shift version of a UBFIZ
14482 i.e. the *andim_ashift<mode>_bfiz pattern. */
14483 if (GET_CODE (op0) == ASHIFT
14484 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14485 XEXP (op0, 1)))
14486 {
14487 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14488 (enum rtx_code) code, 0, speed);
14489 if (speed)
14490 *cost += extra_cost->alu.bfx;
14491
14492 return true;
14493 }
14494 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14495 {
14496           /* We possibly get the immediate for free; this is not
14497              modelled.  */
14498 *cost += rtx_cost (op0, int_mode,
14499 (enum rtx_code) code, 0, speed);
14500 if (speed)
14501 *cost += extra_cost->alu.logical;
14502
14503 return true;
14504 }
14505 }
14506 else
14507 {
14508 rtx new_op0 = op0;
14509
14510 /* Handle ORN, EON, or BIC. */
14511 if (GET_CODE (op0) == NOT)
14512 op0 = XEXP (op0, 0);
14513
14514 new_op0 = aarch64_strip_shift (op0);
14515
14516 /* If we had a shift on op0 then this is a logical-shift-
14517 by-register/immediate operation. Otherwise, this is just
14518 a logical operation. */
14519 if (speed)
14520 {
14521 if (new_op0 != op0)
14522 {
14523 /* Shift by immediate. */
14524 if (CONST_INT_P (XEXP (op0, 1)))
14525 *cost += extra_cost->alu.log_shift;
14526 else
14527 *cost += extra_cost->alu.log_shift_reg;
14528 }
14529 else
14530 *cost += extra_cost->alu.logical;
14531 }
14532
14533 /* In both cases we want to cost both operands. */
14534 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14535 0, speed);
14536 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14537 1, speed);
14538
14539 return true;
14540 }
14541 }
14542 return false;
14543
14544 case NOT:
14545 x = XEXP (x, 0);
14546 op0 = aarch64_strip_shift (x);
14547
14548 if (VECTOR_MODE_P (mode))
14549 {
14550 /* Vector NOT. */
14551 *cost += extra_cost->vect.alu;
14552 return false;
14553 }
14554
14555 /* MVN-shifted-reg. */
14556 if (op0 != x)
14557 {
14558 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14559
14560 if (speed)
14561 *cost += extra_cost->alu.log_shift;
14562
14563 return true;
14564 }
14565 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14566 Handle the second form here taking care that 'a' in the above can
14567 be a shift. */
14568 else if (GET_CODE (op0) == XOR)
14569 {
14570 rtx newop0 = XEXP (op0, 0);
14571 rtx newop1 = XEXP (op0, 1);
14572 rtx op0_stripped = aarch64_strip_shift (newop0);
14573
14574 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14575 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14576
14577 if (speed)
14578 {
14579 if (op0_stripped != newop0)
14580 *cost += extra_cost->alu.log_shift;
14581 else
14582 *cost += extra_cost->alu.logical;
14583 }
14584
14585 return true;
14586 }
14587 /* MVN. */
14588 if (speed)
14589 *cost += extra_cost->alu.logical;
14590
14591 return false;
14592
14593 case ZERO_EXTEND:
14594
14595 op0 = XEXP (x, 0);
14596       /* If a value is written in SI mode and then zero extended to DI
14597          mode, the operation will in general be free, since a write to
14598          a 'w' register implicitly zeroes the upper bits of the
14599          corresponding 'x' register.  However, if this is
14600
14601 (set (reg) (zero_extend (reg)))
14602
14603 we must cost the explicit register move. */
14604 if (mode == DImode
14605 && GET_MODE (op0) == SImode)
14606 {
14607 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14608
14609 /* If OP_COST is non-zero, then the cost of the zero extend
14610 is effectively the cost of the inner operation. Otherwise
14611 we have a MOV instruction and we take the cost from the MOV
14612 itself. This is true independently of whether we are
14613 optimizing for space or time. */
14614 if (op_cost)
14615 *cost = op_cost;
14616
14617 return true;
14618 }
14619 else if (MEM_P (op0))
14620 {
14621 /* All loads can zero extend to any size for free. */
14622 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14623 return true;
14624 }
14625
14626 op0 = aarch64_extend_bitfield_pattern_p (x);
14627 if (op0)
14628 {
14629 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14630 if (speed)
14631 *cost += extra_cost->alu.bfx;
14632 return true;
14633 }
14634
14635 if (speed)
14636 {
14637 if (VECTOR_MODE_P (mode))
14638 {
14639 /* UMOV. */
14640 *cost += extra_cost->vect.alu;
14641 }
14642 else
14643 {
14644 /* We generate an AND instead of UXTB/UXTH. */
14645 *cost += extra_cost->alu.logical;
14646 }
14647 }
14648 return false;
14649
14650 case SIGN_EXTEND:
14651 if (MEM_P (XEXP (x, 0)))
14652 {
14653 /* LDRSH. */
14654 if (speed)
14655 {
14656 rtx address = XEXP (XEXP (x, 0), 0);
14657 *cost += extra_cost->ldst.load_sign_extend;
14658
14659 *cost +=
14660 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14661 0, speed));
14662 }
14663 return true;
14664 }
14665
14666 op0 = aarch64_extend_bitfield_pattern_p (x);
14667 if (op0)
14668 {
14669 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14670 if (speed)
14671 *cost += extra_cost->alu.bfx;
14672 return true;
14673 }
14674
14675 if (speed)
14676 {
14677 if (VECTOR_MODE_P (mode))
14678 *cost += extra_cost->vect.alu;
14679 else
14680 *cost += extra_cost->alu.extend;
14681 }
14682 return false;
14683
14684 case ASHIFT:
14685 op0 = XEXP (x, 0);
14686 op1 = XEXP (x, 1);
14687
14688 if (CONST_INT_P (op1))
14689 {
14690 if (speed)
14691 {
14692 if (VECTOR_MODE_P (mode))
14693 {
14694 /* Vector shift (immediate). */
14695 *cost += extra_cost->vect.alu;
14696 }
14697 else
14698 {
14699               /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
14700                  aliases.  */
14701 *cost += extra_cost->alu.shift;
14702 }
14703 }
14704
14705 /* We can incorporate zero/sign extend for free. */
14706 if (GET_CODE (op0) == ZERO_EXTEND
14707 || GET_CODE (op0) == SIGN_EXTEND)
14708 op0 = XEXP (op0, 0);
14709
14710 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14711 return true;
14712 }
14713 else
14714 {
14715 if (VECTOR_MODE_P (mode))
14716 {
14717 if (speed)
14718 /* Vector shift (register). */
14719 *cost += extra_cost->vect.alu;
14720 }
14721 else
14722 {
14723 if (speed)
14724 /* LSLV. */
14725 *cost += extra_cost->alu.shift_reg;
14726
14727 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14728 && CONST_INT_P (XEXP (op1, 1))
14729 && known_eq (INTVAL (XEXP (op1, 1)),
14730 GET_MODE_BITSIZE (mode) - 1))
14731 {
14732 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14733 /* We already demanded XEXP (op1, 0) to be REG_P, so
14734 don't recurse into it. */
14735 return true;
14736 }
14737 }
14738 return false; /* All arguments need to be in registers. */
14739 }
14740
14741 case ROTATE:
14742 case ROTATERT:
14743 case LSHIFTRT:
14744 case ASHIFTRT:
14745 op0 = XEXP (x, 0);
14746 op1 = XEXP (x, 1);
14747
14748 if (CONST_INT_P (op1))
14749 {
14750 /* ASR (immediate) and friends. */
14751 if (speed)
14752 {
14753 if (VECTOR_MODE_P (mode))
14754 *cost += extra_cost->vect.alu;
14755 else
14756 *cost += extra_cost->alu.shift;
14757 }
14758
14759 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14760 return true;
14761 }
14762 else
14763 {
14764 if (VECTOR_MODE_P (mode))
14765 {
14766 if (speed)
14767 /* Vector shift (register). */
14768 *cost += extra_cost->vect.alu;
14769 }
14770 else
14771 {
14772 if (speed)
14773 /* ASR (register) and friends. */
14774 *cost += extra_cost->alu.shift_reg;
14775
14776 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14777 && CONST_INT_P (XEXP (op1, 1))
14778 && known_eq (INTVAL (XEXP (op1, 1)),
14779 GET_MODE_BITSIZE (mode) - 1))
14780 {
14781 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14782 /* We already demanded XEXP (op1, 0) to be REG_P, so
14783 don't recurse into it. */
14784 return true;
14785 }
14786 }
14787 return false; /* All arguments need to be in registers. */
14788 }
14789
14790 case SYMBOL_REF:
14791
14792 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14793 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14794 {
14795 /* LDR. */
14796 if (speed)
14797 *cost += extra_cost->ldst.load;
14798 }
14799 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14800 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14801 {
14802 /* ADRP, followed by ADD. */
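        /* E.g. for the small code model a symbol address is typically
           materialised as (illustrative):
             adrp x0, sym
             add  x0, x0, :lo12:sym
           which is why a second instruction is added to the baseline here.  */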
14803 *cost += COSTS_N_INSNS (1);
14804 if (speed)
14805 *cost += 2 * extra_cost->alu.arith;
14806 }
14807 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14808 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14809 {
14810 /* ADR. */
14811 if (speed)
14812 *cost += extra_cost->alu.arith;
14813 }
14814
14815 if (flag_pic)
14816 {
14817 /* One extra load instruction, after accessing the GOT. */
14818 *cost += COSTS_N_INSNS (1);
14819 if (speed)
14820 *cost += extra_cost->ldst.load;
14821 }
14822 return true;
14823
14824 case HIGH:
14825 case LO_SUM:
14826 /* ADRP/ADD (immediate). */
14827 if (speed)
14828 *cost += extra_cost->alu.arith;
14829 return true;
14830
14831 case ZERO_EXTRACT:
14832 case SIGN_EXTRACT:
14833 /* UBFX/SBFX. */
14834 if (speed)
14835 {
14836 if (VECTOR_MODE_P (mode))
14837 *cost += extra_cost->vect.alu;
14838 else
14839 *cost += extra_cost->alu.bfx;
14840 }
14841
14842 /* We can trust that the immediates used will be correct (there
14843 are no by-register forms), so we need only cost op0. */
14844 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14845 return true;
14846
14847 case MULT:
14848 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14849 /* aarch64_rtx_mult_cost always handles recursion to its
14850 operands. */
14851 return true;
14852
14853 case MOD:
14854     /* We can expand signed mod by a power of 2 using a NEGS, two parallel
14855        ANDs and a CSNEG.  Assume here that the cost of a CSNEG is the same
14856        as that of an unconditional negate.  This case should only ever be
14857        reached through the set_smod_pow2_cheap check in expmed.cc.  */
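    /* As a rough illustration (register allocation is of course up to the
       compiler), a signed x % 4 typically expands to something like:
         negs  w1, w0
         and   w0, w0, 3
         and   w1, w1, 3
         csneg w0, w0, w1, mi
       hence the four-instruction baseline below.  */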
14858 if (CONST_INT_P (XEXP (x, 1))
14859 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14860 && (mode == SImode || mode == DImode))
14861 {
14862 /* We expand to 4 instructions. Reset the baseline. */
14863 *cost = COSTS_N_INSNS (4);
14864
14865 if (speed)
14866 *cost += 2 * extra_cost->alu.logical
14867 + 2 * extra_cost->alu.arith;
14868
14869 return true;
14870 }
14871
14872 /* Fall-through. */
14873 case UMOD:
14874 if (speed)
14875 {
14876       /* Slightly prefer UMOD over SMOD.  */
14877 if (VECTOR_MODE_P (mode))
14878 *cost += extra_cost->vect.alu;
14879 else if (GET_MODE_CLASS (mode) == MODE_INT)
14880 *cost += (extra_cost->mult[mode == DImode].add
14881 + extra_cost->mult[mode == DImode].idiv
14882 + (code == MOD ? 1 : 0));
14883 }
14884 return false; /* All arguments need to be in registers. */
14885
14886 case DIV:
14887 case UDIV:
14888 case SQRT:
14889 if (speed)
14890 {
14891 if (VECTOR_MODE_P (mode))
14892 *cost += extra_cost->vect.alu;
14893 else if (GET_MODE_CLASS (mode) == MODE_INT)
14894 /* There is no integer SQRT, so only DIV and UDIV can get
14895 here. */
14896 *cost += (extra_cost->mult[mode == DImode].idiv
14897                   /* Slightly prefer UDIV over SDIV.  */
14898 + (code == DIV ? 1 : 0));
14899 else
14900 *cost += extra_cost->fp[mode == DFmode].div;
14901 }
14902 return false; /* All arguments need to be in registers. */
14903
14904 case IF_THEN_ELSE:
14905 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14906 XEXP (x, 2), cost, speed);
14907
14908 case EQ:
14909 case NE:
14910 case GT:
14911 case GTU:
14912 case LT:
14913 case LTU:
14914 case GE:
14915 case GEU:
14916 case LE:
14917 case LEU:
14918
14919 return false; /* All arguments must be in registers. */
14920
14921 case FMA:
14922 op0 = XEXP (x, 0);
14923 op1 = XEXP (x, 1);
14924 op2 = XEXP (x, 2);
14925
14926 if (speed)
14927 {
14928 if (VECTOR_MODE_P (mode))
14929 *cost += extra_cost->vect.alu;
14930 else
14931 *cost += extra_cost->fp[mode == DFmode].fma;
14932 }
14933
14934 /* FMSUB, FNMADD, and FNMSUB are free. */
14935 if (GET_CODE (op0) == NEG)
14936 op0 = XEXP (op0, 0);
14937
14938 if (GET_CODE (op2) == NEG)
14939 op2 = XEXP (op2, 0);
14940
14941 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14942 and the by-element operand as operand 0. */
14943 if (GET_CODE (op1) == NEG)
14944 op1 = XEXP (op1, 0);
14945
14946 /* Catch vector-by-element operations. The by-element operand can
14947 either be (vec_duplicate (vec_select (x))) or just
14948 (vec_select (x)), depending on whether we are multiplying by
14949 a vector or a scalar.
14950
14951        Canonicalization is not very good in these cases: FMA4 will put the
14952        by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
14953 if (GET_CODE (op0) == VEC_DUPLICATE)
14954 op0 = XEXP (op0, 0);
14955 else if (GET_CODE (op1) == VEC_DUPLICATE)
14956 op1 = XEXP (op1, 0);
14957
14958 if (GET_CODE (op0) == VEC_SELECT)
14959 op0 = XEXP (op0, 0);
14960 else if (GET_CODE (op1) == VEC_SELECT)
14961 op1 = XEXP (op1, 0);
14962
14963 /* If the remaining parameters are not registers,
14964 get the cost to put them into registers. */
14965 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14966 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14967 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14968 return true;
14969
14970 case FLOAT:
14971 case UNSIGNED_FLOAT:
14972 if (speed)
14973 *cost += extra_cost->fp[mode == DFmode].fromint;
14974 return false;
14975
14976 case FLOAT_EXTEND:
14977 if (speed)
14978 {
14979 if (VECTOR_MODE_P (mode))
14980 {
14981             /* Vector FP widening conversion (e.g. FCVTL).  */
14982 *cost += extra_cost->vect.alu;
14983 }
14984 else
14985 *cost += extra_cost->fp[mode == DFmode].widen;
14986 }
14987 return false;
14988
14989 case FLOAT_TRUNCATE:
14990 if (speed)
14991 {
14992 if (VECTOR_MODE_P (mode))
14993 {
14994             /* Vector FP narrowing conversion (e.g. FCVTN).  */
14995 *cost += extra_cost->vect.alu;
14996 }
14997 else
14998 *cost += extra_cost->fp[mode == DFmode].narrow;
14999 }
15000 return false;
15001
15002 case FIX:
15003 case UNSIGNED_FIX:
15004 x = XEXP (x, 0);
15005 /* Strip the rounding part. They will all be implemented
15006 by the fcvt* family of instructions anyway. */
15007 if (GET_CODE (x) == UNSPEC)
15008 {
15009 unsigned int uns_code = XINT (x, 1);
15010
15011 if (uns_code == UNSPEC_FRINTA
15012 || uns_code == UNSPEC_FRINTM
15013 || uns_code == UNSPEC_FRINTN
15014 || uns_code == UNSPEC_FRINTP
15015 || uns_code == UNSPEC_FRINTZ)
15016 x = XVECEXP (x, 0, 0);
15017 }
15018
15019 if (speed)
15020 {
15021 if (VECTOR_MODE_P (mode))
15022 *cost += extra_cost->vect.alu;
15023 else
15024 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15025 }
15026
15027 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15028 fixed-point fcvt. */
15029 if (GET_CODE (x) == MULT
15030 && ((VECTOR_MODE_P (mode)
15031 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15032 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15033 {
15034 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15035 0, speed);
15036 return true;
15037 }
15038
15039 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15040 return true;
15041
15042 case ABS:
15043 if (VECTOR_MODE_P (mode))
15044 {
15045 /* ABS (vector). */
15046 if (speed)
15047 *cost += extra_cost->vect.alu;
15048 }
15049 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15050 {
15051 op0 = XEXP (x, 0);
15052
15053 /* FABD, which is analogous to FADD. */
15054 if (GET_CODE (op0) == MINUS)
15055 {
15056 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15057 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15058 if (speed)
15059 *cost += extra_cost->fp[mode == DFmode].addsub;
15060
15061 return true;
15062 }
15063 /* Simple FABS is analogous to FNEG. */
15064 if (speed)
15065 *cost += extra_cost->fp[mode == DFmode].neg;
15066 }
15067 else
15068 {
15069         /* Integer ABS will either be split into
15070            two arithmetic instructions, or will be an ABS
15071            (scalar), which we don't model.  */
15072 *cost = COSTS_N_INSNS (2);
15073 if (speed)
15074 *cost += 2 * extra_cost->alu.arith;
15075 }
15076 return false;
15077
15078 case SMAX:
15079 case SMIN:
15080 if (speed)
15081 {
15082 if (VECTOR_MODE_P (mode))
15083 *cost += extra_cost->vect.alu;
15084 else
15085 {
15086 /* FMAXNM/FMINNM/FMAX/FMIN.
15087 TODO: This may not be accurate for all implementations, but
15088 we do not model this in the cost tables. */
15089 *cost += extra_cost->fp[mode == DFmode].addsub;
15090 }
15091 }
15092 return false;
15093
15094 case UNSPEC:
15095 /* The floating point round to integer frint* instructions. */
15096 if (aarch64_frint_unspec_p (XINT (x, 1)))
15097 {
15098 if (speed)
15099 *cost += extra_cost->fp[mode == DFmode].roundint;
15100
15101 return false;
15102 }
15103
15104 if (XINT (x, 1) == UNSPEC_RBIT)
15105 {
15106 if (speed)
15107 *cost += extra_cost->alu.rev;
15108
15109 return false;
15110 }
15111 break;
15112
15113 case TRUNCATE:
15114
15115 /* Decompose <su>muldi3_highpart. */
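      /* The shape being matched is, schematically:
           (truncate:DI
             (lshiftrt:TI
               (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
               (const_int 64)))
         i.e. the high 64 bits of a widening multiply, which maps to a
         single UMULH/SMULH.  */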
15116 if (/* (truncate:DI */
15117 mode == DImode
15118 /* (lshiftrt:TI */
15119 && GET_MODE (XEXP (x, 0)) == TImode
15120 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15121 /* (mult:TI */
15122 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15123 /* (ANY_EXTEND:TI (reg:DI))
15124 (ANY_EXTEND:TI (reg:DI))) */
15125 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15126 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15127 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15128 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15129 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15130 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15131 /* (const_int 64) */
15132 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15133 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15134 {
15135 /* UMULH/SMULH. */
15136 if (speed)
15137 *cost += extra_cost->mult[mode == DImode].extend;
15138 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15139 mode, MULT, 0, speed);
15140 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15141 mode, MULT, 1, speed);
15142 return true;
15143 }
15144 break;
15145 case CONST_VECTOR:
15146 {
15147 /* Load using MOVI/MVNI. */
15148 if (aarch64_simd_valid_immediate (x, NULL))
15149 *cost = extra_cost->vect.movi;
15150 else /* Load using constant pool. */
15151 *cost = extra_cost->ldst.load;
15152 break;
15153 }
15154 case VEC_CONCAT:
15155       /* Depending on the operation, either DUP or INS.
15156          For now, keep default costing.  */
15157 break;
15158 case VEC_DUPLICATE:
15159 /* Load using a DUP. */
15160 *cost = extra_cost->vect.dup;
15161 return false;
15162 case VEC_SELECT:
15163 {
15164 rtx op0 = XEXP (x, 0);
15165 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15166
15167         /* Cost a lowpart select as free, a highpart as a DUP, else as an extract.  */
15168 rtx op1 = XEXP (x, 1);
15169 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15170 ;
15171 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15172 *cost = extra_cost->vect.dup;
15173 else
15174 *cost = extra_cost->vect.extract;
15175 return true;
15176 }
15177 default:
15178 break;
15179 }
15180
15181 if (dump_file
15182 && flag_aarch64_verbose_cost)
15183 fprintf (dump_file,
15184 "\nFailed to cost RTX. Assuming default cost.\n");
15185
15186 return true;
15187 }
15188
15189 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15190 calculated for X. This cost is stored in *COST. Returns true
15191 if the total cost of X was calculated. */
15192 static bool
15193 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15194 int param, int *cost, bool speed)
15195 {
15196 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15197
15198 if (dump_file
15199 && flag_aarch64_verbose_cost)
15200 {
15201 print_rtl_single (dump_file, x);
15202 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15203 speed ? "Hot" : "Cold",
15204 *cost, result ? "final" : "partial");
15205 }
15206
15207 return result;
15208 }
15209
15210 static int
15211 aarch64_register_move_cost (machine_mode mode,
15212 reg_class_t from_i, reg_class_t to_i)
15213 {
15214 enum reg_class from = (enum reg_class) from_i;
15215 enum reg_class to = (enum reg_class) to_i;
15216 const struct cpu_regmove_cost *regmove_cost
15217 = aarch64_tune_params.regmove_cost;
15218
15219 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
15220 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15221 || to == STUB_REGS)
15222 to = GENERAL_REGS;
15223
15224 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15225 || from == STUB_REGS)
15226 from = GENERAL_REGS;
15227
15228 /* Make RDFFR very expensive. In particular, if we know that the FFR
15229 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15230 as a way of obtaining a PTRUE. */
15231 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15232 && hard_reg_set_subset_p (reg_class_contents[from_i],
15233 reg_class_contents[FFR_REGS]))
15234 return 80;
15235
15236 /* Moving between GPR and stack cost is the same as GP2GP. */
15237 if ((from == GENERAL_REGS && to == STACK_REG)
15238 || (to == GENERAL_REGS && from == STACK_REG))
15239 return regmove_cost->GP2GP;
15240
15241 /* To/From the stack register, we move via the gprs. */
15242 if (to == STACK_REG || from == STACK_REG)
15243 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15244 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15245
15246 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15247 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15248 && known_eq (GET_MODE_SIZE (mode), 16))
15249 {
15250 /* 128-bit operations on general registers require 2 instructions. */
15251 if (from == GENERAL_REGS && to == GENERAL_REGS)
15252 return regmove_cost->GP2GP * 2;
15253 else if (from == GENERAL_REGS)
15254 return regmove_cost->GP2FP * 2;
15255 else if (to == GENERAL_REGS)
15256 return regmove_cost->FP2GP * 2;
15257
15258 /* When AdvSIMD instructions are disabled it is not possible to move
15259 a 128-bit value directly between Q registers. This is handled in
15260 secondary reload. A general register is used as a scratch to move
15261 the upper DI value and the lower DI value is moved directly,
15262 hence the cost is the sum of three moves. */
15263 if (! TARGET_SIMD)
15264 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15265
15266 return regmove_cost->FP2FP;
15267 }
15268
15269 if (from == GENERAL_REGS && to == GENERAL_REGS)
15270 return regmove_cost->GP2GP;
15271 else if (from == GENERAL_REGS)
15272 return regmove_cost->GP2FP;
15273 else if (to == GENERAL_REGS)
15274 return regmove_cost->FP2GP;
15275
15276 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15277 {
15278 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15279 The cost must be greater than 2 units to indicate that direct
15280 moves aren't possible. */
15281 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15282 + aarch64_tune_params.memmov_cost.store_fp);
15283 return MIN (CEIL (per_vector, 2), 4);
15284 }
15285
15286 return regmove_cost->FP2FP;
15287 }
15288
15289 /* Implements TARGET_MEMORY_MOVE_COST. */
15290 static int
15291 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15292 {
15293 enum reg_class rclass = (enum reg_class) rclass_i;
15294 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15295 ? reg_classes_intersect_p (rclass, PR_REGS)
15296 : reg_class_subset_p (rclass, PR_REGS))
15297 return (in
15298 ? aarch64_tune_params.memmov_cost.load_pred
15299 : aarch64_tune_params.memmov_cost.store_pred);
15300
15301 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15302 ? reg_classes_intersect_p (rclass, FP_REGS)
15303 : reg_class_subset_p (rclass, FP_REGS))
15304 return (in
15305 ? aarch64_tune_params.memmov_cost.load_fp
15306 : aarch64_tune_params.memmov_cost.store_fp);
15307
15308 return (in
15309 ? aarch64_tune_params.memmov_cost.load_int
15310 : aarch64_tune_params.memmov_cost.store_int);
15311 }
15312
15313 /* Implement TARGET_INIT_BUILTINS. */
15314 static void
15315 aarch64_init_builtins ()
15316 {
15317 aarch64_general_init_builtins ();
15318 aarch64_sve::init_builtins ();
15319 #ifdef SUBTARGET_INIT_BUILTINS
15320 SUBTARGET_INIT_BUILTINS;
15321 #endif
15322 }
15323
15324 /* Implement TARGET_FOLD_BUILTIN. */
15325 static tree
15326 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15327 {
15328 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15329 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15330 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15331 switch (code & AARCH64_BUILTIN_CLASS)
15332 {
15333 case AARCH64_BUILTIN_GENERAL:
15334 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15335
15336 case AARCH64_BUILTIN_SVE:
15337 return NULL_TREE;
15338 }
15339 gcc_unreachable ();
15340 }
15341
15342 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15343 static bool
15344 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15345 {
15346 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15347 tree fndecl = gimple_call_fndecl (stmt);
15348 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15349 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15350 gimple *new_stmt = NULL;
15351 switch (code & AARCH64_BUILTIN_CLASS)
15352 {
15353 case AARCH64_BUILTIN_GENERAL:
15354 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15355 break;
15356
15357 case AARCH64_BUILTIN_SVE:
15358 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15359 break;
15360 }
15361
15362 if (!new_stmt)
15363 return false;
15364
15365 gsi_replace (gsi, new_stmt, false);
15366 return true;
15367 }
15368
15369 /* Implement TARGET_EXPAND_BUILTIN. */
15370 static rtx
15371 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15372 {
15373 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15374 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15375 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15376 switch (code & AARCH64_BUILTIN_CLASS)
15377 {
15378 case AARCH64_BUILTIN_GENERAL:
15379 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15380
15381 case AARCH64_BUILTIN_SVE:
15382 return aarch64_sve::expand_builtin (subcode, exp, target);
15383 }
15384 gcc_unreachable ();
15385 }
15386
15387 /* Implement TARGET_BUILTIN_DECL. */
15388 static tree
15389 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15390 {
15391 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15392 switch (code & AARCH64_BUILTIN_CLASS)
15393 {
15394 case AARCH64_BUILTIN_GENERAL:
15395 return aarch64_general_builtin_decl (subcode, initialize_p);
15396
15397 case AARCH64_BUILTIN_SVE:
15398 return aarch64_sve::builtin_decl (subcode, initialize_p);
15399 }
15400 gcc_unreachable ();
15401 }
15402
15403 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15404 to optimize 1.0/sqrt. */
15405
15406 static bool
15407 use_rsqrt_p (machine_mode mode)
15408 {
15409 return (!flag_trapping_math
15410 && flag_unsafe_math_optimizations
15411 && ((aarch64_tune_params.approx_modes->recip_sqrt
15412 & AARCH64_APPROX_MODE (mode))
15413 || flag_mrecip_low_precision_sqrt));
15414 }
15415
15416 /* Function to decide when to use the approximate reciprocal square root
15417 builtin. */
15418
15419 static tree
15420 aarch64_builtin_reciprocal (tree fndecl)
15421 {
15422 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15423
15424 if (!use_rsqrt_p (mode))
15425 return NULL_TREE;
15426 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15427 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15428 switch (code & AARCH64_BUILTIN_CLASS)
15429 {
15430 case AARCH64_BUILTIN_GENERAL:
15431 return aarch64_general_builtin_rsqrt (subcode);
15432
15433 case AARCH64_BUILTIN_SVE:
15434 return NULL_TREE;
15435 }
15436 gcc_unreachable ();
15437 }
15438
15439 /* Emit code to perform the floating-point operation:
15440
15441 DST = SRC1 * SRC2
15442
15443 where all three operands are already known to be registers.
15444 If the operation is an SVE one, PTRUE is a suitable all-true
15445 predicate. */
15446
15447 static void
15448 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15449 {
15450 if (ptrue)
15451 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15452 dst, ptrue, src1, src2,
15453 gen_int_mode (SVE_RELAXED_GP, SImode)));
15454 else
15455 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15456 }
15457
15458 /* Emit instruction sequence to compute either the approximate square root
15459 or its approximate reciprocal, depending on the flag RECP, and return
15460 whether the sequence was emitted or not. */
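/* Sketch of the maths (informal): starting from the FRSQRTE estimate
   x0 ~= 1/sqrt(a), each FRSQRTS step refines it as
       x_{n+1} = x_n * (3 - a * x_n * x_n) / 2,
   which is the Newton-Raphson iteration for 1/sqrt(a); when !RECP the
   square root itself is then recovered as a * (1/sqrt(a)).  */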
15461
15462 bool
15463 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15464 {
15465 machine_mode mode = GET_MODE (dst);
15466
15467 if (GET_MODE_INNER (mode) == HFmode)
15468 {
15469 gcc_assert (!recp);
15470 return false;
15471 }
15472
15473 if (!recp)
15474 {
15475 if (!(flag_mlow_precision_sqrt
15476 || (aarch64_tune_params.approx_modes->sqrt
15477 & AARCH64_APPROX_MODE (mode))))
15478 return false;
15479
15480 if (!flag_finite_math_only
15481 || flag_trapping_math
15482 || !flag_unsafe_math_optimizations
15483 || optimize_function_for_size_p (cfun))
15484 return false;
15485 }
15486 else
15487 /* Caller assumes we cannot fail. */
15488 gcc_assert (use_rsqrt_p (mode));
15489
15490 rtx pg = NULL_RTX;
15491 if (aarch64_sve_mode_p (mode))
15492 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15493 machine_mode mmsk = (VECTOR_MODE_P (mode)
15494 ? related_int_vector_mode (mode).require ()
15495 : int_mode_for_mode (mode).require ());
15496 rtx xmsk = NULL_RTX;
15497 if (!recp)
15498 {
15499 /* When calculating the approximate square root, compare the
15500 argument with 0.0 and create a mask. */
15501 rtx zero = CONST0_RTX (mode);
15502 if (pg)
15503 {
15504 xmsk = gen_reg_rtx (GET_MODE (pg));
15505 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15506 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15507 xmsk, pg, hint, src, zero));
15508 }
15509 else
15510 {
15511 xmsk = gen_reg_rtx (mmsk);
15512 emit_insn (gen_rtx_SET (xmsk,
15513 gen_rtx_NEG (mmsk,
15514 gen_rtx_EQ (mmsk, src, zero))));
15515 }
15516 }
15517
15518 /* Estimate the approximate reciprocal square root. */
15519 rtx xdst = gen_reg_rtx (mode);
15520 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15521
15522 /* Iterate over the series twice for SF and thrice for DF. */
15523 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15524
15525   /* Optionally iterate over the series one time fewer for faster
15526      performance, at the cost of some accuracy.  */
15527 if ((recp && flag_mrecip_low_precision_sqrt)
15528 || (!recp && flag_mlow_precision_sqrt))
15529 iterations--;
15530
15531 /* Iterate over the series to calculate the approximate reciprocal square
15532 root. */
15533 rtx x1 = gen_reg_rtx (mode);
15534 while (iterations--)
15535 {
15536 rtx x2 = gen_reg_rtx (mode);
15537 aarch64_emit_mult (x2, pg, xdst, xdst);
15538
15539 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15540
15541 if (iterations > 0)
15542 aarch64_emit_mult (xdst, pg, xdst, x1);
15543 }
15544
15545 if (!recp)
15546 {
15547 if (pg)
15548 /* Multiply nonzero source values by the corresponding intermediate
15549 result elements, so that the final calculation is the approximate
15550 square root rather than its reciprocal. Select a zero result for
15551 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15552 otherwise. */
15553 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15554 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15555 else
15556 {
15557           /* Qualify the approximate reciprocal square root when the
15558              argument is 0.0 by squashing the intermediate result to 0.0.  */
15559 rtx xtmp = gen_reg_rtx (mmsk);
15560 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15561 gen_rtx_SUBREG (mmsk, xdst, 0)));
15562 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15563
15564 /* Calculate the approximate square root. */
15565 aarch64_emit_mult (xdst, pg, xdst, src);
15566 }
15567 }
15568
15569 /* Finalize the approximation. */
15570 aarch64_emit_mult (dst, pg, xdst, x1);
15571
15572 return true;
15573 }
15574
15575 /* Emit the instruction sequence to compute the approximation for the division
15576 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
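/* Sketch of the maths (informal): starting from the FRECPE estimate
   x0 ~= 1/den, each FRECPS step refines it as
       x_{n+1} = x_n * (2 - den * x_n),
   the Newton-Raphson iteration for the reciprocal; the quotient is then
   obtained as num * (1/den).  */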
15577
15578 bool
15579 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15580 {
15581 machine_mode mode = GET_MODE (quo);
15582
15583 if (GET_MODE_INNER (mode) == HFmode)
15584 return false;
15585
15586 bool use_approx_division_p = (flag_mlow_precision_div
15587 || (aarch64_tune_params.approx_modes->division
15588 & AARCH64_APPROX_MODE (mode)));
15589
15590 if (!flag_finite_math_only
15591 || flag_trapping_math
15592 || !flag_unsafe_math_optimizations
15593 || optimize_function_for_size_p (cfun)
15594 || !use_approx_division_p)
15595 return false;
15596
15597 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15598 return false;
15599
15600 rtx pg = NULL_RTX;
15601 if (aarch64_sve_mode_p (mode))
15602 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15603
15604 /* Estimate the approximate reciprocal. */
15605 rtx xrcp = gen_reg_rtx (mode);
15606 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15607
15608 /* Iterate over the series twice for SF and thrice for DF. */
15609 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15610
15611   /* Optionally iterate over the series fewer times for faster performance,
15612      at the cost of some accuracy.  The default is 2 for DF and 1 for SF.  */
15613 if (flag_mlow_precision_div)
15614 iterations = (GET_MODE_INNER (mode) == DFmode
15615 ? aarch64_double_recp_precision
15616 : aarch64_float_recp_precision);
15617
15618 /* Iterate over the series to calculate the approximate reciprocal. */
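/* Each pass applies one Newton-Raphson step for 1/den:

     x' = x * (2 - den * x)

   FRECPS computes the (2 - a * b) factor, so XTMP holds the step factor
   and the multiplication by XTMP refines the estimate. */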
15619 rtx xtmp = gen_reg_rtx (mode);
15620 while (iterations--)
15621 {
15622 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15623
15624 if (iterations > 0)
15625 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15626 }
15627
15628 if (num != CONST1_RTX (mode))
15629 {
15630 /* As the approximate reciprocal of DEN is already calculated, only
15631 calculate the approximate division when NUM is not 1.0. */
15632 rtx xnum = force_reg (mode, num);
15633 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15634 }
15635
15636 /* Finalize the approximation. */
15637 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15638 return true;
15639 }
15640
15641 /* Return the number of instructions that can be issued per cycle. */
15642 static int
15643 aarch64_sched_issue_rate (void)
15644 {
15645 return aarch64_tune_params.issue_rate;
15646 }
15647
15648 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15649 static int
15650 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15651 {
15652 if (DEBUG_INSN_P (insn))
15653 return more;
15654
15655 rtx_code code = GET_CODE (PATTERN (insn));
15656 if (code == USE || code == CLOBBER)
15657 return more;
15658
15659 if (get_attr_type (insn) == TYPE_NO_INSN)
15660 return more;
15661
15662 return more - 1;
15663 }
15664
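/* Return the DFA lookahead depth to use during multipass scheduling:
   the issue rate when it is greater than one and we are not scheduling
   for fusion, otherwise zero (no lookahead). */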
15665 static int
15666 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15667 {
15668 int issue_rate = aarch64_sched_issue_rate ();
15669
15670 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15671 }
15672
15673
15674 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15675 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15676 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15677
15678 static int
15679 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15680 int ready_index)
15681 {
15682 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15683 }
15684
15685
15686 /* Vectorizer cost model target hooks. */
15687
15688 /* Information about how the CPU would issue the scalar, Advanced SIMD
15689 or SVE version of a vector loop, using the scheme defined by the
15690 aarch64_base_vec_issue_info hierarchy of structures. */
15691 class aarch64_vec_op_count
15692 {
15693 public:
15694 aarch64_vec_op_count () = default;
15695 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15696 unsigned int = 1);
15697
15698 unsigned int vec_flags () const { return m_vec_flags; }
15699 unsigned int vf_factor () const { return m_vf_factor; }
15700
15701 const aarch64_base_vec_issue_info *base_issue_info () const;
15702 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15703 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15704
15705 fractional_cost rename_cycles_per_iter () const;
15706 fractional_cost min_nonpred_cycles_per_iter () const;
15707 fractional_cost min_pred_cycles_per_iter () const;
15708 fractional_cost min_cycles_per_iter () const;
15709
15710 void dump () const;
15711
15712 /* The number of individual "general" operations. See the comments
15713 in aarch64_base_vec_issue_info for details. */
15714 unsigned int general_ops = 0;
15715
15716 /* The number of load and store operations, under the same scheme
15717 as above. */
15718 unsigned int loads = 0;
15719 unsigned int stores = 0;
15720
15721 /* The minimum number of cycles needed to execute all loop-carried
15722 operations, which in the vector code become associated with
15723 reductions. */
15724 unsigned int reduction_latency = 0;
15725
15726 /* The number of individual predicate operations. See the comments
15727 in aarch64_sve_vec_issue_info for details. */
15728 unsigned int pred_ops = 0;
15729
15730 private:
15731 /* The issue information for the core. */
15732 const aarch64_vec_issue_info *m_issue_info = nullptr;
15733
15734 /* - If M_VEC_FLAGS is zero then this structure describes scalar code.
15735 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15736 Advanced SIMD code.
15737 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15738 SVE code. */
15739 unsigned int m_vec_flags = 0;
15740
15741 /* Assume that, when the code is executing on the core described
15742 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15743 times more data than the vectorizer anticipates.
15744
15745 This is only ever different from 1 for SVE. It allows us to consider
15746 what would happen on a 256-bit SVE target even when the -mtune
15747 parameters say that the “likely” SVE length is 128 bits. */
15748 unsigned int m_vf_factor = 1;
15749 };
15750
15751 aarch64_vec_op_count::
15752 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15753 unsigned int vec_flags, unsigned int vf_factor)
15754 : m_issue_info (issue_info),
15755 m_vec_flags (vec_flags),
15756 m_vf_factor (vf_factor)
15757 {
15758 }
15759
15760 /* Return the base issue information (i.e. the parts that make sense
15761 for both scalar and vector code). Return null if we have no issue
15762 information. */
15763 const aarch64_base_vec_issue_info *
15764 aarch64_vec_op_count::base_issue_info () const
15765 {
15766 if (auto *ret = simd_issue_info ())
15767 return ret;
15768 return m_issue_info->scalar;
15769 }
15770
15771 /* If the structure describes vector code and we have associated issue
15772 information, return that issue information, otherwise return null. */
15773 const aarch64_simd_vec_issue_info *
15774 aarch64_vec_op_count::simd_issue_info () const
15775 {
15776 if (auto *ret = sve_issue_info ())
15777 return ret;
15778 if (m_vec_flags)
15779 return m_issue_info->advsimd;
15780 return nullptr;
15781 }
15782
15783 /* If the structure describes SVE code and we have associated issue
15784 information, return that issue information, otherwise return null. */
15785 const aarch64_sve_vec_issue_info *
15786 aarch64_vec_op_count::sve_issue_info () const
15787 {
15788 if (m_vec_flags & VEC_ANY_SVE)
15789 return m_issue_info->sve;
15790 return nullptr;
15791 }
15792
15793 /* Estimate the minimum number of cycles per iteration needed to rename
15794 the instructions.
15795
15796 ??? For now this is done inline rather than via cost tables, since it
15797 isn't clear how it should be parameterized for the general case. */
15798 fractional_cost
15799 aarch64_vec_op_count::rename_cycles_per_iter () const
15800 {
15801 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15802 || sve_issue_info () == &neoversen2_sve_issue_info
15803 || sve_issue_info () == &neoversev2_sve_issue_info)
15804 /* + 1 for an addition. We've already counted a general op for each
15805 store, so we don't need to account for stores separately. The branch
15806 reads no registers and so does not need to be counted either.
15807
15808 ??? This value is very much on the pessimistic side, but seems to work
15809 pretty well in practice. */
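/* The fractional cost N/5 assumes that at most five of these register
   writes can be renamed per cycle. */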
15810 return { general_ops + loads + pred_ops + 1, 5 };
15811
15812 return 0;
15813 }
15814
15815 /* Like min_cycles_per_iter, but excluding predicate operations. */
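/* The result is the largest of the reduction latency, the rename estimate
   and the various ops/throughput ratios: for example, 6 general ops on a
   core that can issue 2 general ops per cycle impose a lower bound of
   3 cycles. */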
15816 fractional_cost
15817 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15818 {
15819 auto *issue_info = base_issue_info ();
15820
15821 fractional_cost cycles = MAX (reduction_latency, 1);
15822 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15823 cycles = std::max (cycles, { loads + stores,
15824 issue_info->loads_stores_per_cycle });
15825 cycles = std::max (cycles, { general_ops,
15826 issue_info->general_ops_per_cycle });
15827 cycles = std::max (cycles, rename_cycles_per_iter ());
15828 return cycles;
15829 }
15830
15831 /* Like min_cycles_per_iter, but including only the predicate operations. */
15832 fractional_cost
15833 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15834 {
15835 if (auto *issue_info = sve_issue_info ())
15836 return { pred_ops, issue_info->pred_ops_per_cycle };
15837 return 0;
15838 }
15839
15840 /* Estimate the minimum number of cycles needed to issue the operations.
15841 This is a very simplistic model! */
15842 fractional_cost
15843 aarch64_vec_op_count::min_cycles_per_iter () const
15844 {
15845 return std::max (min_nonpred_cycles_per_iter (),
15846 min_pred_cycles_per_iter ());
15847 }
15848
15849 /* Dump information about the structure. */
15850 void
15851 aarch64_vec_op_count::dump () const
15852 {
15853 dump_printf_loc (MSG_NOTE, vect_location,
15854 " load operations = %d\n", loads);
15855 dump_printf_loc (MSG_NOTE, vect_location,
15856 " store operations = %d\n", stores);
15857 dump_printf_loc (MSG_NOTE, vect_location,
15858 " general operations = %d\n", general_ops);
15859 if (sve_issue_info ())
15860 dump_printf_loc (MSG_NOTE, vect_location,
15861 " predicate operations = %d\n", pred_ops);
15862 dump_printf_loc (MSG_NOTE, vect_location,
15863 " reduction latency = %d\n", reduction_latency);
15864 if (auto rcpi = rename_cycles_per_iter ())
15865 dump_printf_loc (MSG_NOTE, vect_location,
15866 " estimated cycles per iteration to rename = %f\n",
15867 rcpi.as_double ());
15868 if (auto pred_cpi = min_pred_cycles_per_iter ())
15869 {
15870 dump_printf_loc (MSG_NOTE, vect_location,
15871 " estimated min cycles per iteration"
15872 " without predication = %f\n",
15873 min_nonpred_cycles_per_iter ().as_double ());
15874 dump_printf_loc (MSG_NOTE, vect_location,
15875 " estimated min cycles per iteration"
15876 " for predication = %f\n", pred_cpi.as_double ());
15877 }
15878 if (auto cpi = min_cycles_per_iter ())
15879 dump_printf_loc (MSG_NOTE, vect_location,
15880 " estimated min cycles per iteration = %f\n",
15881 cpi.as_double ());
15882 }
15883
15884 /* Information about vector code that we're in the process of costing. */
15885 class aarch64_vector_costs : public vector_costs
15886 {
15887 public:
15888 aarch64_vector_costs (vec_info *, bool);
15889
15890 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15891 stmt_vec_info stmt_info, slp_tree, tree vectype,
15892 int misalign,
15893 vect_cost_model_location where) override;
15894 void finish_cost (const vector_costs *) override;
15895 bool better_main_loop_than_p (const vector_costs *other) const override;
15896
15897 private:
15898 void record_potential_advsimd_unrolling (loop_vec_info);
15899 void analyze_loop_vinfo (loop_vec_info);
15900 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15901 aarch64_vec_op_count *);
15902 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15903 fractional_cost, unsigned int,
15904 unsigned int *, bool *);
15905 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15906 unsigned int);
15907 bool prefer_unrolled_loop () const;
15908 unsigned int determine_suggested_unroll_factor ();
15909
15910 /* True if we have performed one-time initialization based on the
15911 vec_info. */
15912 bool m_analyzed_vinfo = false;
15913
15914 /* This loop uses an average operation that is not supported by SVE, but is
15915 supported by Advanced SIMD and SVE2. */
15916 bool m_has_avg = false;
15917
15918 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15919 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15920 SIMD code.
15921 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15922 unsigned int m_vec_flags = 0;
15923
15924 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15925 This means that code such as:
15926
15927 a[0] = x;
15928 a[1] = x;
15929
15930 will be costed as two scalar instructions and two vector instructions
15931 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15932 wins if the costs are equal, because of the fact that the vector costs
15933 include constant initializations whereas the scalar costs don't.
15934 We would therefore tend to vectorize the code above, even though
15935 the scalar version can use a single STP.
15936
15937 We should eventually fix this and model LDP and STP in the main costs;
15938 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15939 Until then, we look specifically for code that does nothing more than
15940 STP-like operations. We cost them on that basis in addition to the
15941 normal latency-based costs.
15942
15943 If the scalar or vector code could be a sequence of STPs +
15944 initialization, this variable counts the cost of the sequence,
15945 with 2 units per instruction. The variable is ~0U for other
15946 kinds of code. */
15947 unsigned int m_stp_sequence_cost = 0;
15948
15949 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15950 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15951 situations, we try to predict whether an Advanced SIMD implementation
15952 of the loop could be completely unrolled and become straight-line code.
15953 If so, it is generally better to use the Advanced SIMD version rather
15954 than length-agnostic SVE, since the SVE loop would execute an unknown
15955 number of times and so could not be completely unrolled in the same way.
15956
15957 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15958 number of Advanced SIMD loop iterations that would be unrolled and
15959 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15960 in the unrolled loop. Both values are zero if we're not applying
15961 the heuristic. */
15962 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15963 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15964
15965 /* If we're vectorizing a loop that executes a constant number of times,
15966 this variable gives the number of times that the vector loop would
15967 iterate, otherwise it is zero. */
15968 uint64_t m_num_vector_iterations = 0;
15969
15970 /* Used only when vectorizing loops. Estimates the number and kind of
15971 operations that would be needed by one iteration of the scalar
15972 or vector loop. There is one entry for each tuning option of
15973 interest. */
15974 auto_vec<aarch64_vec_op_count, 2> m_ops;
15975 };
15976
15977 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15978 bool costing_for_scalar)
15979 : vector_costs (vinfo, costing_for_scalar),
15980 m_vec_flags (costing_for_scalar ? 0
15981 : aarch64_classify_vector_mode (vinfo->vector_mode))
15982 {
15983 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15984 {
15985 m_ops.quick_push ({ issue_info, m_vec_flags });
15986 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15987 {
15988 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15989 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15990 vf_factor });
15991 }
15992 }
15993 }
15994
15995 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15996 vector_costs *
15997 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15998 {
15999 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16000 }
16001
16002 /* Return true if the current CPU should use the new costs defined
16003 in GCC 11. This should be removed for GCC 12 and above, with the
16004 costs applying to all CPUs instead. */
16005 static bool
16006 aarch64_use_new_vector_costs_p ()
16007 {
16008 return (aarch64_tune_params.extra_tuning_flags
16009 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16010 }
16011
16012 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16013 static const simd_vec_cost *
16014 aarch64_simd_vec_costs (tree vectype)
16015 {
16016 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16017 if (vectype != NULL
16018 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16019 && costs->sve != NULL)
16020 return costs->sve;
16021 return costs->advsimd;
16022 }
16023
16024 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16025 static const simd_vec_cost *
16026 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16027 {
16028 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16029 if ((flags & VEC_ANY_SVE) && costs->sve)
16030 return costs->sve;
16031 return costs->advsimd;
16032 }
16033
16034 /* If STMT_INFO is a memory reference, return the scalar memory type,
16035 otherwise return null. */
16036 static tree
16037 aarch64_dr_type (stmt_vec_info stmt_info)
16038 {
16039 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16040 return TREE_TYPE (DR_REF (dr));
16041 return NULL_TREE;
16042 }
16043
16044 /* Decide whether to use the unrolling heuristic described above
16045 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16046 describes the loop that we're vectorizing. */
16047 void
16048 aarch64_vector_costs::
16049 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16050 {
16051 /* The heuristic only makes sense on targets that have the same
16052 vector throughput for SVE and Advanced SIMD. */
16053 if (!(aarch64_tune_params.extra_tuning_flags
16054 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16055 return;
16056
16057 /* We only want to apply the heuristic if LOOP_VINFO is being
16058 vectorized for SVE. */
16059 if (!(m_vec_flags & VEC_ANY_SVE))
16060 return;
16061
16062 /* Check whether it is possible in principle to use Advanced SIMD
16063 instead. */
16064 if (aarch64_autovec_preference == 2)
16065 return;
16066
16067 /* We don't want to apply the heuristic to outer loops, since it's
16068 harder to track two levels of unrolling. */
16069 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16070 return;
16071
16072 /* Only handle cases in which the number of Advanced SIMD iterations
16073 would be known at compile time but the number of SVE iterations
16074 would not. */
16075 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16076 || aarch64_sve_vg.is_constant ())
16077 return;
16078
16079 /* Guess how many times the Advanced SIMD loop would iterate and make
16080 sure that it is within the complete unrolling limit. Even if the
16081 number of iterations is small enough, the number of statements might
16082 not be, which is why we need to estimate the number of statements too. */
16083 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16084 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
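/* Since the estimated SVE VF scales with the estimated number of 128-bit
   quadwords per vector, this gives roughly the VF that an equivalent
   Advanced SIMD loop would use. */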
16085 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16086 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16087 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16088 return;
16089
16090 /* Record that we're applying the heuristic and should try to estimate
16091 the number of statements in the Advanced SIMD loop. */
16092 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
16093 }
16094
16095 /* Do one-time initialization of the aarch64_vector_costs given that we're
16096 costing the loop vectorization described by LOOP_VINFO. */
16097 void
16098 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16099 {
16100 /* Record the number of times that the vector loop would execute,
16101 if known. */
16102 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16103 auto scalar_niters = max_stmt_executions_int (loop);
16104 if (scalar_niters >= 0)
16105 {
16106 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16107 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16108 m_num_vector_iterations = scalar_niters / vf;
16109 else
16110 m_num_vector_iterations = CEIL (scalar_niters, vf);
16111 }
16112
16113 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16114 heuristic described above m_unrolled_advsimd_niters. */
16115 record_potential_advsimd_unrolling (loop_vinfo);
16116
16117 /* Record the issue information for any SVE WHILE instructions that the
16118 loop needs. */
16119 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16120 {
16121 unsigned int num_masks = 0;
16122 rgroup_controls *rgm;
16123 unsigned int num_vectors_m1;
16124 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16125 if (rgm->type)
16126 num_masks += num_vectors_m1 + 1;
16127 for (auto &ops : m_ops)
16128 if (auto *issue = ops.sve_issue_info ())
16129 ops.pred_ops += num_masks * issue->while_pred_ops;
16130 }
16131 }
16132
16133 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16134 static int
16135 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16136 tree vectype,
16137 int misalign ATTRIBUTE_UNUSED)
16138 {
16139 unsigned elements;
16140 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16141 bool fp = false;
16142
16143 if (vectype != NULL)
16144 fp = FLOAT_TYPE_P (vectype);
16145
16146 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16147
16148 switch (type_of_cost)
16149 {
16150 case scalar_stmt:
16151 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16152
16153 case scalar_load:
16154 return costs->scalar_load_cost;
16155
16156 case scalar_store:
16157 return costs->scalar_store_cost;
16158
16159 case vector_stmt:
16160 return fp ? simd_costs->fp_stmt_cost
16161 : simd_costs->int_stmt_cost;
16162
16163 case vector_load:
16164 return simd_costs->align_load_cost;
16165
16166 case vector_store:
16167 return simd_costs->store_cost;
16168
16169 case vec_to_scalar:
16170 return simd_costs->vec_to_scalar_cost;
16171
16172 case scalar_to_vec:
16173 return simd_costs->scalar_to_vec_cost;
16174
16175 case unaligned_load:
16176 case vector_gather_load:
16177 return simd_costs->unalign_load_cost;
16178
16179 case unaligned_store:
16180 case vector_scatter_store:
16181 return simd_costs->unalign_store_cost;
16182
16183 case cond_branch_taken:
16184 return costs->cond_taken_branch_cost;
16185
16186 case cond_branch_not_taken:
16187 return costs->cond_not_taken_branch_cost;
16188
16189 case vec_perm:
16190 return simd_costs->permute_cost;
16191
16192 case vec_promote_demote:
16193 return fp ? simd_costs->fp_stmt_cost
16194 : simd_costs->int_stmt_cost;
16195
16196 case vec_construct:
16197 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16198 return elements / 2 + 1;
16199
16200 default:
16201 gcc_unreachable ();
16202 }
16203 }
16204
16205 /* If an access of kind KIND for STMT_INFO represents one vector of an
16206 LD[234] or ST[234] operation, return the total number of vectors
16207 (2, 3 or 4), otherwise return a value outside that range. */
16208 static int
16209 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16210 {
16211 if ((kind == vector_load
16212 || kind == unaligned_load
16213 || kind == vector_store
16214 || kind == unaligned_store)
16215 && STMT_VINFO_DATA_REF (stmt_info))
16216 {
16217 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16218 if (stmt_info
16219 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16220 return DR_GROUP_SIZE (stmt_info);
16221 }
16222 return 0;
16223 }
16224
16225 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16226 vectors would produce a series of LDP or STP operations. KIND is the
16227 kind of statement that STMT_INFO represents. */
16228 static bool
16229 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16230 stmt_vec_info stmt_info)
16231 {
16232 switch (kind)
16233 {
16234 case vector_load:
16235 case vector_store:
16236 case unaligned_load:
16237 case unaligned_store:
16238 break;
16239
16240 default:
16241 return false;
16242 }
16243
16244 if (aarch64_tune_params.extra_tuning_flags
16245 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16246 return false;
16247
16248 return is_gimple_assign (stmt_info->stmt);
16249 }
16250
16251 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16252 or multiply-subtract sequence that might be suitable for fusing into a
16253 single instruction. If VEC_FLAGS is zero, analyze the operation as
16254 a scalar one, otherwise analyze it as an operation on vectors with those
16255 VEC_* flags. */
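/* For example, for:

     _1 = _2 * _3;
     _4 = _1 + _5;

   the second statement is a candidate, since it could normally be
   implemented as a single FMLA/FMLS or MLA/MLS. */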
16256 static bool
16257 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16258 unsigned int vec_flags)
16259 {
16260 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16261 if (!assign)
16262 return false;
16263 tree_code code = gimple_assign_rhs_code (assign);
16264 if (code != PLUS_EXPR && code != MINUS_EXPR)
16265 return false;
16266
16267 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16268 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16269 return false;
16270
16271 for (int i = 1; i < 3; ++i)
16272 {
16273 tree rhs = gimple_op (assign, i);
16274 /* ??? Should we try to check for a single use as well? */
16275 if (TREE_CODE (rhs) != SSA_NAME)
16276 continue;
16277
16278 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16279 if (!def_stmt_info
16280 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16281 continue;
16282 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16283 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16284 continue;
16285
16286 if (vec_flags & VEC_ADVSIMD)
16287 {
16288 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16289 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16290 only supports MLA forms, so will require a move if the result
16291 cannot be tied to the accumulator. The most important case in
16292 which this is true is when the accumulator input is invariant. */
16293 rhs = gimple_op (assign, 3 - i);
16294 if (TREE_CODE (rhs) != SSA_NAME)
16295 return false;
16296 def_stmt_info = vinfo->lookup_def (rhs);
16297 if (!def_stmt_info
16298 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16299 return false;
16300 }
16301
16302 return true;
16303 }
16304 return false;
16305 }
16306
16307 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16308 in-loop reduction that SVE supports directly, return its latency in cycles,
16309 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16310 instructions. */
16311 static unsigned int
16312 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16313 stmt_vec_info stmt_info,
16314 const sve_vec_cost *sve_costs)
16315 {
16316 switch (vect_reduc_type (vinfo, stmt_info))
16317 {
16318 case EXTRACT_LAST_REDUCTION:
16319 return sve_costs->clast_cost;
16320
16321 case FOLD_LEFT_REDUCTION:
16322 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16323 {
16324 case E_HFmode:
16325 case E_BFmode:
16326 return sve_costs->fadda_f16_cost;
16327
16328 case E_SFmode:
16329 return sve_costs->fadda_f32_cost;
16330
16331 case E_DFmode:
16332 return sve_costs->fadda_f64_cost;
16333
16334 default:
16335 break;
16336 }
16337 break;
16338 }
16339
16340 return 0;
16341 }
16342
16343 /* STMT_INFO describes a loop-carried operation in the original scalar code
16344 that we are considering implementing as a reduction. Return one of the
16345 following values, depending on VEC_FLAGS:
16346
16347 - If VEC_FLAGS is zero, return the loop carry latency of the original
16348 scalar operation.
16349
16350 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16351 Advanced SIMD implementation.
16352
16353 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16354 SVE implementation. */
16355 static unsigned int
16356 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16357 unsigned int vec_flags)
16358 {
16359 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16360 const sve_vec_cost *sve_costs = nullptr;
16361 if (vec_flags & VEC_ANY_SVE)
16362 sve_costs = aarch64_tune_params.vec_costs->sve;
16363
16364 /* If the caller is asking for the SVE latency, check for forms of reduction
16365 that only SVE can handle directly. */
16366 if (sve_costs)
16367 {
16368 unsigned int latency
16369 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16370 if (latency)
16371 return latency;
16372 }
16373
16374 /* Handle scalar costs. */
16375 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16376 if (vec_flags == 0)
16377 {
16378 if (is_float)
16379 return vec_costs->scalar_fp_stmt_cost;
16380 return vec_costs->scalar_int_stmt_cost;
16381 }
16382
16383 /* Otherwise, the loop body just contains normal integer or FP operations,
16384 with a vector reduction outside the loop. */
16385 const simd_vec_cost *simd_costs
16386 = aarch64_simd_vec_costs_for_flags (vec_flags);
16387 if (is_float)
16388 return simd_costs->fp_stmt_cost;
16389 return simd_costs->int_stmt_cost;
16390 }
16391
16392 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16393 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16394 try to subdivide the target-independent categorization provided by KIND
16395 to get a more accurate cost. */
16396 static fractional_cost
16397 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16398 stmt_vec_info stmt_info,
16399 fractional_cost stmt_cost)
16400 {
16401 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16402 the extension with the load. */
16403 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16404 return 0;
16405
16406 return stmt_cost;
16407 }
16408
16409 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16410 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16411 when vectorized would operate on vector type VECTYPE. Try to subdivide
16412 the target-independent categorization provided by KIND to get a more
16413 accurate cost. WHERE specifies where the cost associated with KIND
16414 occurs. */
16415 static fractional_cost
16416 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16417 stmt_vec_info stmt_info, tree vectype,
16418 enum vect_cost_model_location where,
16419 fractional_cost stmt_cost)
16420 {
16421 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16422 const sve_vec_cost *sve_costs = nullptr;
16423 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16424 sve_costs = aarch64_tune_params.vec_costs->sve;
16425
16426 /* It's generally better to avoid costing inductions, since the induction
16427 will usually be hidden by other operations. This is particularly true
16428 for things like COND_REDUCTIONS. */
16429 if (is_a<gphi *> (stmt_info->stmt))
16430 return 0;
16431
16432 /* Detect cases in which vec_to_scalar is describing the extraction of a
16433 vector element in preparation for a scalar store. The store itself is
16434 costed separately. */
16435 if (vect_is_store_elt_extraction (kind, stmt_info))
16436 return simd_costs->store_elt_extra_cost;
16437
16438 /* Detect SVE gather loads, which are costed as a single scalar_load
16439 for each element. We therefore need to divide the full-instruction
16440 cost by the number of elements in the vector. */
16441 if (kind == scalar_load
16442 && sve_costs
16443 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16444 {
16445 unsigned int nunits = vect_nunits_for_cost (vectype);
16446 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16447 return { sve_costs->gather_load_x64_cost, nunits };
16448 return { sve_costs->gather_load_x32_cost, nunits };
16449 }
16450
16451 /* Detect cases in which a scalar_store is really storing one element
16452 in a scatter operation. */
16453 if (kind == scalar_store
16454 && sve_costs
16455 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16456 return sve_costs->scatter_store_elt_cost;
16457
16458 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16459 if (kind == vec_to_scalar
16460 && where == vect_body
16461 && sve_costs)
16462 {
16463 unsigned int latency
16464 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16465 if (latency)
16466 return latency;
16467 }
16468
16469 /* Detect cases in which vec_to_scalar represents a single reduction
16470 instruction like FADDP or MAXV. */
16471 if (kind == vec_to_scalar
16472 && where == vect_epilogue
16473 && vect_is_reduction (stmt_info))
16474 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16475 {
16476 case E_QImode:
16477 return simd_costs->reduc_i8_cost;
16478
16479 case E_HImode:
16480 return simd_costs->reduc_i16_cost;
16481
16482 case E_SImode:
16483 return simd_costs->reduc_i32_cost;
16484
16485 case E_DImode:
16486 return simd_costs->reduc_i64_cost;
16487
16488 case E_HFmode:
16489 case E_BFmode:
16490 return simd_costs->reduc_f16_cost;
16491
16492 case E_SFmode:
16493 return simd_costs->reduc_f32_cost;
16494
16495 case E_DFmode:
16496 return simd_costs->reduc_f64_cost;
16497
16498 default:
16499 break;
16500 }
16501
16502 /* Otherwise stick with the original categorization. */
16503 return stmt_cost;
16504 }
16505
16506 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16507 for STMT_INFO, which has cost kind KIND and which when vectorized would
16508 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16509 targets. */
16510 static fractional_cost
16511 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16512 stmt_vec_info stmt_info, tree vectype,
16513 fractional_cost stmt_cost)
16514 {
16515 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16516 vector register size or number of units. Integer promotions of this
16517 type therefore map to SXT[BHW] or UXT[BHW].
16518
16519 Most loads have extending forms that can do the sign or zero extension
16520 on the fly. Optimistically assume that a load followed by an extension
16521 will fold to this form during combine, and that the extension therefore
16522 comes for free. */
16523 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16524 stmt_cost = 0;
16525
16526 /* For similar reasons, vector_stmt integer truncations are a no-op,
16527 because we can just ignore the unused upper bits of the source. */
16528 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16529 stmt_cost = 0;
16530
16531 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16532 but there are no equivalent instructions for SVE. This means that
16533 (all other things being equal) 128-bit SVE needs twice as many load
16534 and store instructions as Advanced SIMD in order to process vector pairs.
16535
16536 Also, scalar code can often use LDP and STP to access pairs of values,
16537 so it is too simplistic to say that one SVE load or store replaces
16538 VF scalar loads and stores.
16539
16540 Ideally we would account for this in the scalar and Advanced SIMD
16541 costs by making suitable load/store pairs as cheap as a single
16542 load/store. However, that would be a very invasive change and in
16543 practice it tends to stress other parts of the cost model too much.
16544 E.g. stores of scalar constants currently count just a store,
16545 whereas stores of vector constants count a store and a vec_init.
16546 This is an artificial distinction for AArch64, where stores of
16547 nonzero scalar constants need the same kind of register invariant
16548 as vector stores.
16549
16550 An alternative would be to double the cost of any SVE loads and stores
16551 that could be paired in Advanced SIMD (and possibly also paired in
16552 scalar code). But this tends to stress other parts of the cost model
16553 in the same way. It also means that we can fall back to Advanced SIMD
16554 even if full-loop predication would have been useful.
16555
16556 Here we go for a more conservative version: double the costs of SVE
16557 loads and stores if one iteration of the scalar loop processes enough
16558 elements for it to use a whole number of Advanced SIMD LDP or STP
16559 instructions. This makes it very likely that the VF would be 1 for
16560 Advanced SIMD, and so no epilogue should be needed. */
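/* For example, a group of four 64-bit elements spans 256 bits per scalar
   iteration, which is exactly one Advanced SIMD LDP/STP of Q registers,
   so the SVE load/store cost is doubled below. */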
16561 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16562 {
16563 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16564 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16565 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16566 if (multiple_p (count * elt_bits, 256)
16567 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16568 stmt_cost *= 2;
16569 }
16570
16571 return stmt_cost;
16572 }
16573
16574 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16575 and which when vectorized would operate on vector type VECTYPE. Add the
16576 cost of any embedded operations. */
16577 static fractional_cost
16578 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16579 tree vectype, fractional_cost stmt_cost)
16580 {
16581 if (vectype)
16582 {
16583 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16584
16585 /* Detect cases in which a vector load or store represents an
16586 LD[234] or ST[234] instruction. */
16587 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16588 {
16589 case 2:
16590 stmt_cost += simd_costs->ld2_st2_permute_cost;
16591 break;
16592
16593 case 3:
16594 stmt_cost += simd_costs->ld3_st3_permute_cost;
16595 break;
16596
16597 case 4:
16598 stmt_cost += simd_costs->ld4_st4_permute_cost;
16599 break;
16600 }
16601
16602 if (kind == vector_stmt || kind == vec_to_scalar)
16603 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16604 {
16605 if (FLOAT_TYPE_P (cmp_type))
16606 stmt_cost += simd_costs->fp_stmt_cost;
16607 else
16608 stmt_cost += simd_costs->int_stmt_cost;
16609 }
16610 }
16611
16612 if (kind == scalar_stmt)
16613 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16614 {
16615 if (FLOAT_TYPE_P (cmp_type))
16616 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16617 else
16618 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16619 }
16620
16621 return stmt_cost;
16622 }
16623
16624 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16625 and they describe an operation in the body of a vector loop. Record issue
16626 information relating to the vector operation in OPS. */
16627 void
16628 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16629 stmt_vec_info stmt_info,
16630 aarch64_vec_op_count *ops)
16631 {
16632 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16633 if (!base_issue)
16634 return;
16635 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16636 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16637
16638 /* Calculate the minimum cycles per iteration imposed by a reduction
16639 operation. */
16640 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16641 && vect_is_reduction (stmt_info))
16642 {
16643 unsigned int base
16644 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16645
16646 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16647 that's not yet the case. */
16648 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16649 }
16650
16651 /* Assume that multiply-adds will become a single operation. */
16652 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16653 return;
16654
16655 /* Count the basic operation cost associated with KIND. */
16656 switch (kind)
16657 {
16658 case cond_branch_taken:
16659 case cond_branch_not_taken:
16660 case vector_gather_load:
16661 case vector_scatter_store:
16662 /* We currently don't expect these to be used in a loop body. */
16663 break;
16664
16665 case vec_perm:
16666 case vec_promote_demote:
16667 case vec_construct:
16668 case vec_to_scalar:
16669 case scalar_to_vec:
16670 case vector_stmt:
16671 case scalar_stmt:
16672 ops->general_ops += count;
16673 break;
16674
16675 case scalar_load:
16676 case vector_load:
16677 case unaligned_load:
16678 ops->loads += count;
16679 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16680 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16681 break;
16682
16683 case vector_store:
16684 case unaligned_store:
16685 case scalar_store:
16686 ops->stores += count;
16687 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16688 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16689 break;
16690 }
16691
16692 /* Add any embedded comparison operations. */
16693 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16694 && vect_embedded_comparison_type (stmt_info))
16695 ops->general_ops += count;
16696
16697 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16698 have only accounted for one. */
16699 if ((kind == vector_stmt || kind == vec_to_scalar)
16700 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16701 ops->general_ops += count;
16702
16703 /* Count the predicate operations needed by an SVE comparison. */
16704 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16705 if (tree type = vect_comparison_type (stmt_info))
16706 {
16707 unsigned int base = (FLOAT_TYPE_P (type)
16708 ? sve_issue->fp_cmp_pred_ops
16709 : sve_issue->int_cmp_pred_ops);
16710 ops->pred_ops += base * count;
16711 }
16712
16713 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16714 if (simd_issue)
16715 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16716 {
16717 case 2:
16718 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16719 break;
16720
16721 case 3:
16722 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16723 break;
16724
16725 case 4:
16726 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16727 break;
16728 }
16729
16730 /* Add any overhead associated with gather loads and scatter stores. */
16731 if (sve_issue
16732 && (kind == scalar_load || kind == scalar_store)
16733 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16734 {
16735 unsigned int pairs = CEIL (count, 2);
16736 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16737 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16738 }
16739 }
16740
16741 /* Return true if STMT_INFO contains a memory access and if the constant
16742 component of the memory address is aligned to SIZE bytes. */
16743 static bool
16744 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16745 poly_uint64 size)
16746 {
16747 if (!STMT_VINFO_DATA_REF (stmt_info))
16748 return false;
16749
16750 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16751 stmt_info = first_stmt;
16752 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16753 /* Needed for gathers & scatters, for example. */
16754 if (!constant_offset)
16755 return false;
16756
16757 return multiple_p (wi::to_poly_offset (constant_offset), size);
16758 }
16759
16760 /* Check if a scalar or vector stmt could be part of a region of code
16761 that does nothing more than store values to memory, in the scalar
16762 case using STP. Return the cost of the stmt if so, counting 2 for
16763 one instruction. Return ~0U otherwise.
16764
16765 The arguments are a subset of those passed to add_stmt_cost. */
16766 unsigned int
16767 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16768 stmt_vec_info stmt_info, tree vectype)
16769 {
16770 /* Code that stores vector constants uses a vector_load to create
16771 the constant. We don't apply the heuristic to that case for two
16772 main reasons:
16773
16774 - At the moment, STPs are only formed via peephole2, and the
16775 constant scalar moves would often come between STRs and so
16776 prevent STP formation.
16777
16778 - The scalar code also has to load the constant somehow, and that
16779 isn't costed. */
16780 switch (kind)
16781 {
16782 case scalar_to_vec:
16783 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16784 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16785
16786 case vec_construct:
16787 if (FLOAT_TYPE_P (vectype))
16788 /* Count 1 insn for the maximum number of FP->SIMD INS
16789 instructions. */
16790 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16791
16792 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16793 maximum number of GPR->SIMD INS instructions. */
16794 return vect_nunits_for_cost (vectype) * 4 * count;
16795
16796 case vector_store:
16797 case unaligned_store:
16798 /* Count 1 insn per vector if we can't form STP Q pairs. */
16799 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16800 return count * 2;
16801 if (aarch64_tune_params.extra_tuning_flags
16802 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16803 return count * 2;
16804
16805 if (stmt_info)
16806 {
16807 /* Assume we won't be able to use STP if the constant offset
16808 component of the address is misaligned. ??? This could be
16809 removed if we formed STP pairs earlier, rather than relying
16810 on peephole2. */
16811 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16812 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16813 return count * 2;
16814 }
16815 return CEIL (count, 2) * 2;
16816
16817 case scalar_store:
16818 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16819 {
16820 /* Check for a mode in which STP pairs can be formed. */
16821 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16822 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16823 return ~0U;
16824
16825 /* Assume we won't be able to use STP if the constant offset
16826 component of the address is misaligned. ??? This could be
16827 removed if we formed STP pairs earlier, rather than relying
16828 on peephole2. */
16829 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16830 return ~0U;
16831 }
16832 return count;
16833
16834 default:
16835 return ~0U;
16836 }
16837 }
16838
16839 unsigned
16840 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16841 stmt_vec_info stmt_info, slp_tree,
16842 tree vectype, int misalign,
16843 vect_cost_model_location where)
16844 {
16845 fractional_cost stmt_cost
16846 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16847
16848 bool in_inner_loop_p = (where == vect_body
16849 && stmt_info
16850 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16851
16852 /* Do one-time initialization based on the vinfo. */
16853 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16854 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16855 {
16856 if (loop_vinfo)
16857 analyze_loop_vinfo (loop_vinfo);
16858
16859 m_analyzed_vinfo = true;
16860 }
16861
16862 /* Apply the heuristic described above m_stp_sequence_cost. */
16863 if (m_stp_sequence_cost != ~0U)
16864 {
16865 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16866 stmt_info, vectype);
16867 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16868 }
16869
16870 /* Try to get a more accurate cost by looking at STMT_INFO instead
16871 of just looking at KIND. */
16872 if (stmt_info && aarch64_use_new_vector_costs_p ())
16873 {
16874 /* If we scalarize a strided store, the vectorizer costs one
16875 vec_to_scalar for each element. However, we can store the first
16876 element using an FP store without a separate extract step. */
16877 if (vect_is_store_elt_extraction (kind, stmt_info))
16878 count -= 1;
16879
16880 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16881 stmt_info, stmt_cost);
16882
16883 if (vectype && m_vec_flags)
16884 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16885 stmt_info, vectype,
16886 where, stmt_cost);
16887 }
16888
16889 /* Do any SVE-specific adjustments to the cost. */
16890 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16891 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16892 vectype, stmt_cost);
16893
16894 if (stmt_info && aarch64_use_new_vector_costs_p ())
16895 {
16896 /* Account for any extra "embedded" costs that apply additively
16897 to the base cost calculated above. */
16898 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16899 stmt_cost);
16900
16901 /* If we're recording a nonzero vector loop body cost for the
16902 innermost loop, also estimate the operations that would need
16903 to be issued by all relevant implementations of the loop. */
16904 if (loop_vinfo
16905 && (m_costing_for_scalar || where == vect_body)
16906 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16907 && stmt_cost != 0)
16908 for (auto &ops : m_ops)
16909 count_ops (count, kind, stmt_info, &ops);
16910
16911 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16912 estimate the number of statements in the unrolled Advanced SIMD
16913 loop. For simplicity, we assume that one iteration of the
16914 Advanced SIMD loop would need the same number of statements
16915 as one iteration of the SVE loop. */
16916 if (where == vect_body && m_unrolled_advsimd_niters)
16917 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16918
16919 /* Detect the use of an averaging operation. */
16920 gimple *stmt = stmt_info->stmt;
16921 if (is_gimple_call (stmt)
16922 && gimple_call_internal_p (stmt))
16923 {
16924 switch (gimple_call_internal_fn (stmt))
16925 {
16926 case IFN_AVG_FLOOR:
16927 case IFN_AVG_CEIL:
16928 m_has_avg = true;
16929 default:
16930 break;
16931 }
16932 }
16933 }
16934 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16935 }
16936
16937 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16938 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16939 says that we should prefer the Advanced SIMD loop. */
16940 bool
16941 aarch64_vector_costs::prefer_unrolled_loop () const
16942 {
16943 if (!m_unrolled_advsimd_stmts)
16944 return false;
16945
16946 if (dump_enabled_p ())
16947 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16948 " unrolled Advanced SIMD loop = "
16949 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
16950 m_unrolled_advsimd_stmts);
16951
16952 /* The balance here is tricky. On the one hand, we can't be sure whether
16953 the code is vectorizable with Advanced SIMD or not. However, even if
16954 it isn't vectorizable with Advanced SIMD, there's a possibility that
16955 the scalar code could also be unrolled. Some of the code might then
16956 benefit from SLP, or from using LDP and STP. We therefore apply
16957 the heuristic regardless of can_use_advsimd_p. */
16958 return (m_unrolled_advsimd_stmts
16959 && (m_unrolled_advsimd_stmts
16960 <= (unsigned int) param_max_completely_peeled_insns));
16961 }
16962
16963 /* Subroutine of adjust_body_cost for handling SVE. Use OPS to work out
16964 how fast the SVE code can be issued and compare it to the equivalent
16965 value for scalar code (SCALAR_CYCLES_PER_ITER).
16968
16969 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16970 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16971 is true if we think the loop body is too expensive. */
16972
16973 fractional_cost
16974 aarch64_vector_costs::
16975 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16976 fractional_cost scalar_cycles_per_iter,
16977 unsigned int orig_body_cost, unsigned int *body_cost,
16978 bool *should_disparage)
16979 {
16980 if (dump_enabled_p ())
16981 ops->dump ();
16982
16983 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16984 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16985
16986 /* If the scalar version of the loop could issue at least as
16987 quickly as the predicate parts of the SVE loop, make the SVE loop
16988 prohibitively expensive. In this case vectorization is adding an
16989 overhead that the original scalar code didn't have.
16990
16991 This is mostly intended to detect cases in which WHILELOs dominate
16992 for very tight loops, which is something that normal latency-based
16993 costs would not model. Adding this kind of cliffedge would be
16994 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16995 code in the caller handles that case in a more conservative way. */
16996 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
16997 if (scalar_cycles_per_iter < sve_estimate)
16998 {
16999 unsigned int min_cost
17000 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17001 if (*body_cost < min_cost)
17002 {
17003 if (dump_enabled_p ())
17004 dump_printf_loc (MSG_NOTE, vect_location,
17005 "Increasing body cost to %d because the"
17006 " scalar code could issue within the limit"
17007 " imposed by predicate operations\n",
17008 min_cost);
17009 *body_cost = min_cost;
17010 *should_disparage = true;
17011 }
17012 }
17013
17014 return sve_cycles_per_iter;
17015 }
17016
17017 unsigned int
17018 aarch64_vector_costs::determine_suggested_unroll_factor ()
17019 {
17020 bool sve = m_vec_flags & VEC_ANY_SVE;
17021 /* If we are trying to unroll an Advanced SIMD main loop that contains
17022 an averaging operation that we do not support with SVE and we might use a
17023 predicated epilogue, we need to be conservative and block unrolling as
17024 this might lead to a less optimal loop for the first and only epilogue
17025 using the original loop's vectorization factor.
17026 TODO: Remove this constraint when we add support for multiple epilogue
17027 vectorization. */
17028 if (!sve && !TARGET_SVE2 && m_has_avg)
17029 return 1;
17030
17031 unsigned int max_unroll_factor = 1;
17032 for (auto vec_ops : m_ops)
17033 {
17034 aarch64_simd_vec_issue_info const *vec_issue
17035 = vec_ops.simd_issue_info ();
17036 if (!vec_issue)
17037 return 1;
17038 /* Limit the unroll factor to a value adjustable by the user; the default
17039 value is 4. */
17040 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17041 unsigned int factor
17042 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17043 unsigned int temp;
17044
17045 /* Sanity check, this should never happen. */
17046 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17047 return 1;
17048
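/* For each class of operation, limit the unroll factor to roughly the
   number of iterations whose operations the core could issue during
   FACTOR cycles (the latency of the longest reduction chain, or 1 if
   there is none). For example, 1 store per iteration at 2 stores per
   cycle with a reduction latency of 4 allows up to 8 iterations. */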
17049 /* Check stores. */
17050 if (vec_ops.stores > 0)
17051 {
17052 temp = CEIL (factor * vec_issue->stores_per_cycle,
17053 vec_ops.stores);
17054 unroll_factor = MIN (unroll_factor, temp);
17055 }
17056
17057 /* Check loads + stores. */
17058 if (vec_ops.loads > 0)
17059 {
17060 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17061 vec_ops.loads + vec_ops.stores);
17062 unroll_factor = MIN (unroll_factor, temp);
17063 }
17064
17065 /* Check general ops. */
17066 if (vec_ops.general_ops > 0)
17067 {
17068 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17069 vec_ops.general_ops);
17070 unroll_factor = MIN (unroll_factor, temp);
17071 }
17072 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17073 }
17074
17075 /* Make sure unroll factor is power of 2. */
17076 return 1 << ceil_log2 (max_unroll_factor);
17077 }
17078
17079 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17080 and return the new cost. */
17081 unsigned int
17082 aarch64_vector_costs::
17083 adjust_body_cost (loop_vec_info loop_vinfo,
17084 const aarch64_vector_costs *scalar_costs,
17085 unsigned int body_cost)
17086 {
17087 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17088 return body_cost;
17089
17090 const auto &scalar_ops = scalar_costs->m_ops[0];
17091 const auto &vector_ops = m_ops[0];
17092 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17093 unsigned int orig_body_cost = body_cost;
17094 bool should_disparage = false;
17095
17096 if (dump_enabled_p ())
17097 dump_printf_loc (MSG_NOTE, vect_location,
17098 "Original vector body cost = %d\n", body_cost);
17099
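/* Convert the scalar estimate to cycles per vector iteration's worth of
   work, so that it can be compared directly with the vector estimates
   below. */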
17100 fractional_cost scalar_cycles_per_iter
17101 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17102
17103 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17104
17105 if (dump_enabled_p ())
17106 {
17107 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17108 dump_printf_loc (MSG_NOTE, vect_location,
17109 "Vector loop iterates at most %wd times\n",
17110 m_num_vector_iterations);
17111 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17112 scalar_ops.dump ();
17113 dump_printf_loc (MSG_NOTE, vect_location,
17114 " estimated cycles per vector iteration"
17115 " (for VF %d) = %f\n",
17116 estimated_vf, scalar_cycles_per_iter.as_double ());
17117 }
17118
17119 if (vector_ops.sve_issue_info ())
17120 {
17121 if (dump_enabled_p ())
17122 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17123 vector_cycles_per_iter
17124 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17125 orig_body_cost, &body_cost, &should_disparage);
17126
17127 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17128 {
17129 /* Also take Neoverse V1 tuning into account, doubling the
17130 scalar and Advanced SIMD estimates to account for the
17131 doubling in SVE vector length. */
17132 if (dump_enabled_p ())
17133 dump_printf_loc (MSG_NOTE, vect_location,
17134 "Neoverse V1 estimate:\n");
17135 auto vf_factor = m_ops[1].vf_factor ();
17136 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17137 orig_body_cost, &body_cost, &should_disparage);
17138 }
17139 }
17140 else
17141 {
17142 if (dump_enabled_p ())
17143 {
17144 dump_printf_loc (MSG_NOTE, vect_location,
17145 "Vector issue estimate:\n");
17146 vector_ops.dump ();
17147 }
17148 }
17149
17150 /* Decide whether to stick to latency-based costs or whether to try to
17151 take issue rates into account. */
17152 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17153 if (m_vec_flags & VEC_ANY_SVE)
17154 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
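/* For example, a threshold of 6 iterations becomes CEIL (6, 2) = 3 when
   the estimated SVE vector length is 256 bits (VQ 2).  */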
17155
17156 if (m_num_vector_iterations >= 1
17157 && m_num_vector_iterations < threshold)
17158 {
17159 if (dump_enabled_p ())
17160 dump_printf_loc (MSG_NOTE, vect_location,
17161 "Low iteration count, so using pure latency"
17162 " costs\n");
17163 }
17164 /* Increase the cost of the vector code if it looks like the scalar code
17165 could issue more quickly. These values are only rough estimates,
17166 so minor differences should only result in minor changes. */
17167 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17168 {
17169 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17170 scalar_cycles_per_iter);
17171 if (dump_enabled_p ())
17172 dump_printf_loc (MSG_NOTE, vect_location,
17173 "Increasing body cost to %d because scalar code"
17174 " would issue more quickly\n", body_cost);
17175 }
17176 /* In general, it's expected that the proposed vector code would be able
17177 to issue more quickly than the original scalar code. This should
17178 already be reflected to some extent in the latency-based costs.
17179
17180 However, the latency-based costs effectively assume that the scalar
17181 code and the vector code execute serially, which tends to underplay
17182 one important case: if the real (non-serialized) execution time of
17183 a scalar iteration is dominated by loop-carried dependencies,
17184 and if the vector code is able to reduce both the length of
17185 the loop-carried dependencies *and* the number of cycles needed
17186 to issue the code in general, we can be more confident that the
17187 vector code is an improvement, even if adding the other (non-loop-carried)
17188 latencies tends to hide this saving. We therefore reduce the cost of the
17189 vector loop body in proportion to the saving. */
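/* For example (illustrative numbers): if the scalar code is estimated at
   8 cycles per vector iteration, all of which come from a loop-carried
   reduction, while the vector code needs 5 cycles per iteration and has
   a shorter reduction chain, the vector body cost is scaled by 5/8
   below.  */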
17190 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17191 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17192 && scalar_cycles_per_iter > vector_cycles_per_iter
17193 && !should_disparage)
17194 {
17195 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17196 scalar_cycles_per_iter);
17197 if (dump_enabled_p ())
17198 dump_printf_loc (MSG_NOTE, vect_location,
17199 "Decreasing body cost to %d to account for smaller"
17200 " reduction latency\n", body_cost);
17201 }
17202
17203 return body_cost;
17204 }
17205
17206 void
17207 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17208 {
17209 auto *scalar_costs
17210 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17211 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17212 if (loop_vinfo
17213 && m_vec_flags
17214 && aarch64_use_new_vector_costs_p ())
17215 {
17216 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17217 m_costs[vect_body]);
17218 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17219 }
17220
17221 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17222 the scalar code in the event of a tie, since there is more chance
17223 of scalar code being optimized with surrounding operations. */
17224 if (!loop_vinfo
17225 && scalar_costs
17226 && m_stp_sequence_cost != ~0U
17227 && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17228 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17229
17230 vector_costs::finish_cost (scalar_costs);
17231 }
17232
17233 bool
17234 aarch64_vector_costs::
17235 better_main_loop_than_p (const vector_costs *uncast_other) const
17236 {
17237 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17238
17239 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17240 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17241
17242 if (dump_enabled_p ())
17243 dump_printf_loc (MSG_NOTE, vect_location,
17244 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17245 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17246 vect_vf_for_cost (this_loop_vinfo),
17247 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17248 vect_vf_for_cost (other_loop_vinfo));
17249
17250 /* Apply the unrolling heuristic described above
17251 m_unrolled_advsimd_niters. */
17252 if (bool (m_unrolled_advsimd_stmts)
17253 != bool (other->m_unrolled_advsimd_stmts))
17254 {
17255 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17256 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17257 if (this_prefer_unrolled != other_prefer_unrolled)
17258 {
17259 if (dump_enabled_p ())
17260 dump_printf_loc (MSG_NOTE, vect_location,
17261 "Preferring Advanced SIMD loop because"
17262 " it can be unrolled\n");
17263 return other_prefer_unrolled;
17264 }
17265 }
17266
17267 for (unsigned int i = 0; i < m_ops.length (); ++i)
17268 {
17269 if (dump_enabled_p ())
17270 {
17271 if (i)
17272 dump_printf_loc (MSG_NOTE, vect_location,
17273 "Reconsidering with subtuning %d\n", i);
17274 dump_printf_loc (MSG_NOTE, vect_location,
17275 "Issue info for %s loop:\n",
17276 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17277 this->m_ops[i].dump ();
17278 dump_printf_loc (MSG_NOTE, vect_location,
17279 "Issue info for %s loop:\n",
17280 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17281 other->m_ops[i].dump ();
17282 }
17283
17284 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17285 * this->m_ops[i].vf_factor ());
17286 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17287 * other->m_ops[i].vf_factor ());
17288
17289 /* If it appears that one loop could process the same amount of data
17290 in fewer cycles, prefer that loop over the other one. */
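/* Cross-multiplying by the other loop's VF compares cycles per element
   without fractional division; e.g. 4 cycles per iteration at VF 8 beats
   3 cycles per iteration at VF 4, since 4 * 4 = 16 < 3 * 8 = 24.  */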
17291 fractional_cost this_cost
17292 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17293 fractional_cost other_cost
17294 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17295 if (dump_enabled_p ())
17296 {
17297 dump_printf_loc (MSG_NOTE, vect_location,
17298 "Weighted cycles per iteration of %s loop ~= %f\n",
17299 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17300 this_cost.as_double ());
17301 dump_printf_loc (MSG_NOTE, vect_location,
17302 "Weighted cycles per iteration of %s loop ~= %f\n",
17303 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17304 other_cost.as_double ());
17305 }
17306 if (this_cost != other_cost)
17307 {
17308 if (dump_enabled_p ())
17309 dump_printf_loc (MSG_NOTE, vect_location,
17310 "Preferring loop with lower cycles"
17311 " per iteration\n");
17312 return this_cost < other_cost;
17313 }
17314
17315 /* If the issue rate of SVE code is limited by predicate operations
17316 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17317 and if Advanced SIMD code could issue within the limit imposed
17318 by the predicate operations, the predicate operations are adding an
17319 overhead that the original code didn't have and so we should prefer
17320 the Advanced SIMD version. */
17321 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17322 const aarch64_vec_op_count &b) -> bool
17323 {
17324 if (a.pred_ops == 0
17325 && (b.min_pred_cycles_per_iter ()
17326 > b.min_nonpred_cycles_per_iter ()))
17327 {
17328 if (dump_enabled_p ())
17329 dump_printf_loc (MSG_NOTE, vect_location,
17330 "Preferring Advanced SIMD loop since"
17331 " SVE loop is predicate-limited\n");
17332 return true;
17333 }
17334 return false;
17335 };
17336 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17337 return true;
17338 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17339 return false;
17340 }
17341
17342 return vector_costs::better_main_loop_than_p (other);
17343 }
17344
17345 static void initialize_aarch64_code_model (struct gcc_options *);
17346
17347 /* Parse the TO_PARSE string and put the architecture struct that it
17348 selects into RES and the architectural features into ISA_FLAGS.
17349 Return an aarch64_parse_opt_result describing the parse result.
17350 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17351 When the TO_PARSE string contains an invalid extension,
17352 a copy of the string is created and stored to INVALID_EXTENSION. */
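/* For example, for "armv8.2-a+crc" everything before the first '+' is
   matched against the architecture names below and the remaining "+crc"
   is handed to aarch64_parse_extension.  */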
17353
17354 static enum aarch64_parse_opt_result
17355 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17356 aarch64_feature_flags *isa_flags,
17357 std::string *invalid_extension)
17358 {
17359 const char *ext;
17360 const struct processor *arch;
17361 size_t len;
17362
17363 ext = strchr (to_parse, '+');
17364
17365 if (ext != NULL)
17366 len = ext - to_parse;
17367 else
17368 len = strlen (to_parse);
17369
17370 if (len == 0)
17371 return AARCH64_PARSE_MISSING_ARG;
17372
17373
17374 /* Loop through the list of supported ARCHes to find a match. */
17375 for (arch = all_architectures; arch->name != NULL; arch++)
17376 {
17377 if (strlen (arch->name) == len
17378 && strncmp (arch->name, to_parse, len) == 0)
17379 {
17380 auto isa_temp = arch->flags;
17381
17382 if (ext != NULL)
17383 {
17384 /* TO_PARSE string contains at least one extension. */
17385 enum aarch64_parse_opt_result ext_res
17386 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17387
17388 if (ext_res != AARCH64_PARSE_OK)
17389 return ext_res;
17390 }
17391 /* Extension parsing was successful.  Record the resulting
17392 arch and ISA flags. */
17393 *res = arch;
17394 *isa_flags = isa_temp;
17395 return AARCH64_PARSE_OK;
17396 }
17397 }
17398
17399 /* ARCH name not found in list. */
17400 return AARCH64_PARSE_INVALID_ARG;
17401 }
17402
17403 /* Parse the TO_PARSE string and put the cpu struct that it selects into
17404 RES and the architectural features into ISA_FLAGS.  Return an aarch64_parse_opt_result
17405 describing the parse result. If there is an error parsing, RES and
17406 ISA_FLAGS are left unchanged.
17407 When the TO_PARSE string contains an invalid extension,
17408 a copy of the string is created and stored to INVALID_EXTENSION. */
17409
17410 static enum aarch64_parse_opt_result
17411 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17412 aarch64_feature_flags *isa_flags,
17413 std::string *invalid_extension)
17414 {
17415 const char *ext;
17416 const struct processor *cpu;
17417 size_t len;
17418
17419 ext = strchr (to_parse, '+');
17420
17421 if (ext != NULL)
17422 len = ext - to_parse;
17423 else
17424 len = strlen (to_parse);
17425
17426 if (len == 0)
17427 return AARCH64_PARSE_MISSING_ARG;
17428
17429
17430 /* Loop through the list of supported CPUs to find a match. */
17431 for (cpu = all_cores; cpu->name != NULL; cpu++)
17432 {
17433 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17434 {
17435 auto isa_temp = cpu->flags;
17436
17437 if (ext != NULL)
17438 {
17439 /* TO_PARSE string contains at least one extension. */
17440 enum aarch64_parse_opt_result ext_res
17441 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17442
17443 if (ext_res != AARCH64_PARSE_OK)
17444 return ext_res;
17445 }
17446 /* Extension parsing was successful.  Record the resulting
17447 cpu and ISA flags. */
17448 *res = cpu;
17449 *isa_flags = isa_temp;
17450 return AARCH64_PARSE_OK;
17451 }
17452 }
17453
17454 /* CPU name not found in list. */
17455 return AARCH64_PARSE_INVALID_ARG;
17456 }
17457
17458 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17459 Return an aarch64_parse_opt_result describing the parse result.
17460 If the parsing fails, RES is not changed. */
17461
17462 static enum aarch64_parse_opt_result
17463 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17464 {
17465 const struct processor *cpu;
17466
17467 /* Loop through the list of supported CPUs to find a match. */
17468 for (cpu = all_cores; cpu->name != NULL; cpu++)
17469 {
17470 if (strcmp (cpu->name, to_parse) == 0)
17471 {
17472 *res = cpu;
17473 return AARCH64_PARSE_OK;
17474 }
17475 }
17476
17477 /* CPU name not found in list. */
17478 return AARCH64_PARSE_INVALID_ARG;
17479 }
17480
17481 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17482 described in FLAG.  If it is, return the corresponding flag bits.
17483 If not, report an error mentioning OPTION_NAME and return zero. */
17484
17485 static unsigned int
17486 aarch64_parse_one_option_token (const char *token,
17487 size_t length,
17488 const struct aarch64_flag_desc *flag,
17489 const char *option_name)
17490 {
17491 for (; flag->name != NULL; flag++)
17492 {
17493 if (length == strlen (flag->name)
17494 && !strncmp (flag->name, token, length))
17495 return flag->flag;
17496 }
17497
17498 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17499 return 0;
17500 }
17501
17502 /* Parse OPTION, which is a '.'-separated list of flags to enable.
17503 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17504 default state we inherit from the CPU tuning structures.  OPTION_NAME
17505 gives the top-level option we are parsing in the -moverride string,
17506 for use in error messages. */
17507
17508 static unsigned int
17509 aarch64_parse_boolean_options (const char *option,
17510 const struct aarch64_flag_desc *flags,
17511 unsigned int initial_state,
17512 const char *option_name)
17513 {
17514 const char separator = '.';
17515 const char* specs = option;
17516 const char* ntoken = option;
17517 unsigned int found_flags = initial_state;
17518
17519 while ((ntoken = strchr (specs, separator)))
17520 {
17521 size_t token_length = ntoken - specs;
17522 unsigned token_ops = aarch64_parse_one_option_token (specs,
17523 token_length,
17524 flags,
17525 option_name);
17526 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17527 in the token stream, reset the supported operations. So:
17528
17529 adrp+add.cmp+branch.none.adrp+add
17530
17531 would have the result of turning on only adrp+add fusion. */
17532 if (!token_ops)
17533 found_flags = 0;
17534
17535 found_flags |= token_ops;
17536 specs = ++ntoken;
17537 }
17538
17539 /* The string ended with the separator, so the final token is empty; diagnose the ill-formed option string. */
17540 if (!(*specs))
17541 {
17542 error ("%qs string ill-formed", option_name);
17543 return 0;
17544 }
17545
17546 /* We still have one more token to parse. */
17547 size_t token_length = strlen (specs);
17548 unsigned token_ops = aarch64_parse_one_option_token (specs,
17549 token_length,
17550 flags,
17551 option_name);
17552 if (!token_ops)
17553 found_flags = 0;
17554
17555 found_flags |= token_ops;
17556 return found_flags;
17557 }
17558
17559 /* Support for overriding instruction fusion. */
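/* For example, -moverride=fuse=cmp+branch adds that fusion pair to the
   core's default fusible_ops, while -moverride=fuse=none.cmp+branch
   replaces the defaults with just that pair (see the worked example in
   aarch64_parse_boolean_options).  */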
17560
17561 static void
17562 aarch64_parse_fuse_string (const char *fuse_string,
17563 struct tune_params *tune)
17564 {
17565 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17566 aarch64_fusible_pairs,
17567 tune->fusible_ops,
17568 "fuse=");
17569 }
17570
17571 /* Support for overriding other tuning flags. */
17572
17573 static void
17574 aarch64_parse_tune_string (const char *tune_string,
17575 struct tune_params *tune)
17576 {
17577 tune->extra_tuning_flags
17578 = aarch64_parse_boolean_options (tune_string,
17579 aarch64_tuning_flags,
17580 tune->extra_tuning_flags,
17581 "tune=");
17582 }
17583
17584 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17585 Accept the valid SVE vector widths allowed by
17586 aarch64_sve_vector_bits_enum and use it to override sve_width
17587 in TUNE. */
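/* For example, -moverride=sve_width=256 overrides sve_width in the tuning
   structure with SVE_256; widths other than 128, 256, 512, 1024 or 2048
   are rejected with an error.  */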
17588
17589 static void
17590 aarch64_parse_sve_width_string (const char *tune_string,
17591 struct tune_params *tune)
17592 {
17593 int width = -1;
17594
17595 int n = sscanf (tune_string, "%d", &width);
17596 if (n == EOF)
17597 {
17598 error ("invalid format for %<sve_width%>");
17599 return;
17600 }
17601 switch (width)
17602 {
17603 case SVE_128:
17604 case SVE_256:
17605 case SVE_512:
17606 case SVE_1024:
17607 case SVE_2048:
17608 break;
17609 default:
17610 error ("invalid %<sve_width%> value: %d", width);
17611 }
17612 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17613 }
17614
17615 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17616 we understand.  If it is, extract the option string and hand it off to
17617 the appropriate function. */
17618
17619 void
17620 aarch64_parse_one_override_token (const char* token,
17621 size_t length,
17622 struct tune_params *tune)
17623 {
17624 const struct aarch64_tuning_override_function *fn
17625 = aarch64_tuning_override_functions;
17626
17627 const char *option_part = strchr (token, '=');
17628 if (!option_part)
17629 {
17630 error ("tuning string missing in option (%s)", token);
17631 return;
17632 }
17633
17634 /* Get the length of the option name. */
17635 length = option_part - token;
17636 /* Skip the '=' to get to the option string. */
17637 option_part++;
17638
17639 for (; fn->name != NULL; fn++)
17640 {
17641 if (!strncmp (fn->name, token, length))
17642 {
17643 fn->parse_override (option_part, tune);
17644 return;
17645 }
17646 }
17647
17648 error ("unknown tuning option (%s)", token);
17649 return;
17650 }
17651
17652 /* Set the default TLS size and clamp it to the maximum supported by the selected code model. */
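/* For example, under the small code model an explicit -mtls-size=48 is
   clamped to 32, since the small model only supports 4G of TLS.  */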
17653
17654 static void
17655 initialize_aarch64_tls_size (struct gcc_options *opts)
17656 {
17657 if (aarch64_tls_size == 0)
17658 aarch64_tls_size = 24;
17659
17660 switch (opts->x_aarch64_cmodel_var)
17661 {
17662 case AARCH64_CMODEL_TINY:
17663 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
17664 needs two instructions to address, so we clamp the size to 24. */
17665 if (aarch64_tls_size > 24)
17666 aarch64_tls_size = 24;
17667 break;
17668 case AARCH64_CMODEL_SMALL:
17669 /* The maximum TLS size allowed under small is 4G. */
17670 if (aarch64_tls_size > 32)
17671 aarch64_tls_size = 32;
17672 break;
17673 case AARCH64_CMODEL_LARGE:
17674 /* The maximum TLS size allowed under large is 16E.
17675 FIXME: 16E needs a 64-bit offset; we only support a 48-bit offset for now. */
17676 if (aarch64_tls_size > 48)
17677 aarch64_tls_size = 48;
17678 break;
17679 default:
17680 gcc_unreachable ();
17681 }
17682
17683 return;
17684 }
17685
17686 /* Return the CPU corresponding to the enum CPU. */
17687
17688 static const struct processor *
17689 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17690 {
17691 gcc_assert (cpu != aarch64_none);
17692
17693 return &all_cores[cpu];
17694 }
17695
17696 /* Return the architecture corresponding to the enum ARCH. */
17697
17698 static const struct processor *
17699 aarch64_get_arch (enum aarch64_arch arch)
17700 {
17701 gcc_assert (arch != aarch64_no_arch);
17702
17703 return &all_architectures[arch];
17704 }
17705
17706 /* Parse STRING looking for options in the format:
17707 string :: option:string
17708 option :: name=substring
17709 name :: {a-z}
17710 substring :: defined by option. */
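/* For example, the -moverride string "fuse=none.cmp+branch:sve_width=512"
   contains two options: the first adjusts the fusible operations and the
   second overrides the assumed SVE vector width.  */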
17711
17712 static void
17713 aarch64_parse_override_string (const char* input_string,
17714 struct tune_params* tune)
17715 {
17716 const char separator = ':';
17717 size_t string_length = strlen (input_string) + 1;
17718 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17719 char *string = string_root;
17720 strncpy (string, input_string, string_length);
17721 string[string_length - 1] = '\0';
17722
17723 char* ntoken = string;
17724
17725 while ((ntoken = strchr (string, separator)))
17726 {
17727 size_t token_length = ntoken - string;
17728 /* Make this substring look like a string. */
17729 *ntoken = '\0';
17730 aarch64_parse_one_override_token (string, token_length, tune);
17731 string = ++ntoken;
17732 }
17733
17734 /* One last option to parse. */
17735 aarch64_parse_one_override_token (string, strlen (string), tune);
17736 free (string_root);
17737 }
17738
17739 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17740 are best for a generic target with the currently-enabled architecture
17741 extensions. */
17742 static void
17743 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17744 {
17745 /* Neoverse V1 is the only core that is known to benefit from
17746 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17747 point enabling it for SVE2 and above. */
17748 if (TARGET_SVE2)
17749 current_tune.extra_tuning_flags
17750 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17751 }
17752
17753 static void
17754 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17755 {
17756 if (accepted_branch_protection_string)
17757 {
17758 opts->x_aarch64_branch_protection_string
17759 = xstrdup (accepted_branch_protection_string);
17760 }
17761
17762 /* PR 70044: We have to be careful about being called multiple times for the
17763 same function. This means all changes should be repeatable. */
17764
17765 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17766 Disable the frame pointer flag so the mid-end will not use a frame
17767 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17768 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17769 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17770 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17771 if (opts->x_flag_omit_frame_pointer == 0)
17772 opts->x_flag_omit_frame_pointer = 2;
17773
17774 /* If not optimizing for size, set the default
17775 alignment to what the target wants. */
17776 if (!opts->x_optimize_size)
17777 {
17778 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17779 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17780 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17781 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17782 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17783 opts->x_str_align_functions = aarch64_tune_params.function_align;
17784 }
17785
17786 /* We default to no pc-relative literal loads. */
17787
17788 aarch64_pcrelative_literal_loads = false;
17789
17790 /* If -mpc-relative-literal-loads is set on the command line, this
17791 implies that the user asked for PC relative literal loads. */
17792 if (opts->x_pcrelative_literal_loads == 1)
17793 aarch64_pcrelative_literal_loads = true;
17794
17795 /* In the tiny memory model it makes no sense to disallow PC relative
17796 literal pool loads. */
17797 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17798 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17799 aarch64_pcrelative_literal_loads = true;
17800
17801 /* When enabling the lower precision Newton series for the square root, also
17802 enable it for the reciprocal square root, since the latter is an
17803 intermediary step for the former. */
17804 if (flag_mlow_precision_sqrt)
17805 flag_mrecip_low_precision_sqrt = true;
17806 }
17807
17808 /* 'Unpack' the internal tuning structs and update the options
17809 in OPTS. The caller must have set up selected_tune and selected_arch
17810 as all the other target-specific codegen decisions are
17811 derived from them. */
17812
17813 void
17814 aarch64_override_options_internal (struct gcc_options *opts)
17815 {
17816 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17817 aarch64_tune_flags = tune->flags;
17818 aarch64_tune = tune->sched_core;
17819 /* Make a copy of the tuning parameters attached to the core, which
17820 we may later overwrite. */
17821 aarch64_tune_params = *(tune->tune);
17822 if (tune->tune == &generic_tunings)
17823 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17824
17825 if (opts->x_aarch64_override_tune_string)
17826 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17827 &aarch64_tune_params);
17828
17829 /* This target defaults to strict volatile bitfields. */
17830 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17831 opts->x_flag_strict_volatile_bitfields = 1;
17832
17833 if (aarch64_stack_protector_guard == SSP_GLOBAL
17834 && opts->x_aarch64_stack_protector_guard_offset_str)
17835 {
17836 error ("incompatible options %<-mstack-protector-guard=global%> and "
17837 "%<-mstack-protector-guard-offset=%s%>",
17838 aarch64_stack_protector_guard_offset_str);
17839 }
17840
17841 if (aarch64_stack_protector_guard == SSP_SYSREG
17842 && !(opts->x_aarch64_stack_protector_guard_offset_str
17843 && opts->x_aarch64_stack_protector_guard_reg_str))
17844 {
17845 error ("both %<-mstack-protector-guard-offset%> and "
17846 "%<-mstack-protector-guard-reg%> must be used "
17847 "with %<-mstack-protector-guard=sysreg%>");
17848 }
17849
17850 if (opts->x_aarch64_stack_protector_guard_reg_str)
17851 {
17852 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17853 error ("specify a system register with a small string length");
17854 }
17855
17856 if (opts->x_aarch64_stack_protector_guard_offset_str)
17857 {
17858 char *end;
17859 const char *str = aarch64_stack_protector_guard_offset_str;
17860 errno = 0;
17861 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17862 if (!*str || *end || errno)
17863 error ("%qs is not a valid offset in %qs", str,
17864 "-mstack-protector-guard-offset=");
17865 aarch64_stack_protector_guard_offset = offs;
17866 }
17867
17868 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17869 && !fixed_regs[R18_REGNUM])
17870 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17871
17872 initialize_aarch64_code_model (opts);
17873 initialize_aarch64_tls_size (opts);
17874
17875 int queue_depth = 0;
17876 switch (aarch64_tune_params.autoprefetcher_model)
17877 {
17878 case tune_params::AUTOPREFETCHER_OFF:
17879 queue_depth = -1;
17880 break;
17881 case tune_params::AUTOPREFETCHER_WEAK:
17882 queue_depth = 0;
17883 break;
17884 case tune_params::AUTOPREFETCHER_STRONG:
17885 queue_depth = max_insn_queue_index + 1;
17886 break;
17887 default:
17888 gcc_unreachable ();
17889 }
17890
17891 /* We don't mind passing in global_options_set here as we don't use
17892 the *options_set structs anyway. */
17893 SET_OPTION_IF_UNSET (opts, &global_options_set,
17894 param_sched_autopref_queue_depth, queue_depth);
17895
17896 /* If only Advanced SIMD is used for autovectorization, disable the SVE
17897 vector cost comparison. */
17898 if (aarch64_autovec_preference == 1)
17899 SET_OPTION_IF_UNSET (opts, &global_options_set,
17900 aarch64_sve_compare_costs, 0);
17901
17902 /* Set up the parameters to be used in the prefetching algorithm.  Do not
17903 override the defaults unless we are tuning for a core we have
17904 researched values for. */
17905 if (aarch64_tune_params.prefetch->num_slots > 0)
17906 SET_OPTION_IF_UNSET (opts, &global_options_set,
17907 param_simultaneous_prefetches,
17908 aarch64_tune_params.prefetch->num_slots);
17909 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17910 SET_OPTION_IF_UNSET (opts, &global_options_set,
17911 param_l1_cache_size,
17912 aarch64_tune_params.prefetch->l1_cache_size);
17913 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17914 SET_OPTION_IF_UNSET (opts, &global_options_set,
17915 param_l1_cache_line_size,
17916 aarch64_tune_params.prefetch->l1_cache_line_size);
17917
17918 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17919 {
17920 SET_OPTION_IF_UNSET (opts, &global_options_set,
17921 param_destruct_interfere_size,
17922 aarch64_tune_params.prefetch->l1_cache_line_size);
17923 SET_OPTION_IF_UNSET (opts, &global_options_set,
17924 param_construct_interfere_size,
17925 aarch64_tune_params.prefetch->l1_cache_line_size);
17926 }
17927 else
17928 {
17929 /* For a generic AArch64 target, cover the current range of cache line
17930 sizes. */
17931 SET_OPTION_IF_UNSET (opts, &global_options_set,
17932 param_destruct_interfere_size,
17933 256);
17934 SET_OPTION_IF_UNSET (opts, &global_options_set,
17935 param_construct_interfere_size,
17936 64);
17937 }
17938
17939 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17940 SET_OPTION_IF_UNSET (opts, &global_options_set,
17941 param_l2_cache_size,
17942 aarch64_tune_params.prefetch->l2_cache_size);
17943 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17944 SET_OPTION_IF_UNSET (opts, &global_options_set,
17945 param_prefetch_dynamic_strides, 0);
17946 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17947 SET_OPTION_IF_UNSET (opts, &global_options_set,
17948 param_prefetch_minimum_stride,
17949 aarch64_tune_params.prefetch->minimum_stride);
17950
17951 /* Use the alternative scheduling-pressure algorithm by default. */
17952 SET_OPTION_IF_UNSET (opts, &global_options_set,
17953 param_sched_pressure_algorithm,
17954 SCHED_PRESSURE_MODEL);
17955
17956 /* Validate the guard size. */
17957 int guard_size = param_stack_clash_protection_guard_size;
17958
17959 if (guard_size != 12 && guard_size != 16)
17960 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17961 "size. Given value %d (%llu KB) is out of range",
17962 guard_size, (1ULL << guard_size) / 1024ULL);
17963
17964 /* Enforce that the probing interval is the same as the guard size so the
17965 mid-end does the right thing. */
17966 SET_OPTION_IF_UNSET (opts, &global_options_set,
17967 param_stack_clash_protection_probe_interval,
17968 guard_size);
17969
17970 /* The maybe_set calls won't update the value if the user has explicitly set
17971 one, which means we need to validate that the probing interval and the
17972 guard size are equal. */
17973 int probe_interval
17974 = param_stack_clash_protection_probe_interval;
17975 if (guard_size != probe_interval)
17976 error ("stack clash guard size %<%d%> must be equal to probing interval "
17977 "%<%d%>", guard_size, probe_interval);
17978
17979 /* Enable software prefetching at the specified optimization level for
17980 CPUs that have prefetch.  Lower the optimization level threshold by 1
17981 when profiling is enabled. */
17982 if (opts->x_flag_prefetch_loop_arrays < 0
17983 && !opts->x_optimize_size
17984 && aarch64_tune_params.prefetch->default_opt_level >= 0
17985 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17986 opts->x_flag_prefetch_loop_arrays = 1;
17987
17988 aarch64_override_options_after_change_1 (opts);
17989 }
17990
17991 /* Print a hint with a suggestion for a core or architecture name that
17992 most closely resembles what the user passed in STR. ARCH is true if
17993 the user is asking for an architecture name. ARCH is false if the user
17994 is asking for a core name. */
17995
17996 static void
17997 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17998 {
17999 auto_vec<const char *> candidates;
18000 const struct processor *entry = arch ? all_architectures : all_cores;
18001 for (; entry->name != NULL; entry++)
18002 candidates.safe_push (entry->name);
18003
18004 #ifdef HAVE_LOCAL_CPU_DETECT
18005 /* Also add "native" as a possible value. */
18006 if (arch)
18007 candidates.safe_push ("native");
18008 #endif
18009
18010 char *s;
18011 const char *hint = candidates_list_and_hint (str, s, candidates);
18012 if (hint)
18013 inform (input_location, "valid arguments are: %s;"
18014 " did you mean %qs?", s, hint);
18015 else
18016 inform (input_location, "valid arguments are: %s", s);
18017
18018 XDELETEVEC (s);
18019 }
18020
18021 /* Print a hint with a suggestion for a core name that most closely resembles
18022 what the user passed in STR. */
18023
18024 inline static void
18025 aarch64_print_hint_for_core (const char *str)
18026 {
18027 aarch64_print_hint_for_core_or_arch (str, false);
18028 }
18029
18030 /* Print a hint with a suggestion for an architecture name that most closely
18031 resembles what the user passed in STR. */
18032
18033 inline static void
18034 aarch64_print_hint_for_arch (const char *str)
18035 {
18036 aarch64_print_hint_for_core_or_arch (str, true);
18037 }
18038
18039
18040 /* Print a hint with a suggestion for an extension name
18041 that most closely resembles what the user passed in STR. */
18042
18043 void
18044 aarch64_print_hint_for_extensions (const std::string &str)
18045 {
18046 auto_vec<const char *> candidates;
18047 aarch64_get_all_extension_candidates (&candidates);
18048 char *s;
18049 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18050 if (hint)
18051 inform (input_location, "valid arguments are: %s;"
18052 " did you mean %qs?", s, hint);
18053 else
18054 inform (input_location, "valid arguments are: %s", s);
18055
18056 XDELETEVEC (s);
18057 }
18058
18059 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18060 specified in STR and throw errors if appropriate.  Put the results,
18061 if they are valid, in RES and ISA_FLAGS.  Return whether the option is
18062 valid. */
18063
18064 static bool
18065 aarch64_validate_mcpu (const char *str, const struct processor **res,
18066 aarch64_feature_flags *isa_flags)
18067 {
18068 std::string invalid_extension;
18069 enum aarch64_parse_opt_result parse_res
18070 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18071
18072 if (parse_res == AARCH64_PARSE_OK)
18073 return true;
18074
18075 switch (parse_res)
18076 {
18077 case AARCH64_PARSE_MISSING_ARG:
18078 error ("missing cpu name in %<-mcpu=%s%>", str);
18079 break;
18080 case AARCH64_PARSE_INVALID_ARG:
18081 error ("unknown value %qs for %<-mcpu%>", str);
18082 aarch64_print_hint_for_core (str);
18083 break;
18084 case AARCH64_PARSE_INVALID_FEATURE:
18085 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18086 invalid_extension.c_str (), str);
18087 aarch64_print_hint_for_extensions (invalid_extension);
18088 break;
18089 default:
18090 gcc_unreachable ();
18091 }
18092
18093 return false;
18094 }
18095
18096 /* Straight line speculation indicators. */
18097 enum aarch64_sls_hardening_type
18098 {
18099 SLS_NONE = 0,
18100 SLS_RETBR = 1,
18101 SLS_BLR = 2,
18102 SLS_ALL = 3,
18103 };
18104 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18105
18106 /* Return whether we should mitigate Straight Line Speculation for the RET
18107 and BR instructions. */
18108 bool
18109 aarch64_harden_sls_retbr_p (void)
18110 {
18111 return aarch64_sls_hardening & SLS_RETBR;
18112 }
18113
18114 /* Return whether we should mitigate Straight Line Speculation for the BLR
18115 instruction. */
18116 bool
18117 aarch64_harden_sls_blr_p (void)
18118 {
18119 return aarch64_sls_hardening & SLS_BLR;
18120 }
18121
18122 /* For now we only allow setting these options globally; in the future we
18123 may allow setting them per function. */
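/* For example, -mharden-sls=retbr,blr requests both mitigations and is
   equivalent to -mharden-sls=all; "none" and "all" must appear on their
   own rather than in a comma-separated list.  */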
18124 static void
18125 aarch64_validate_sls_mitigation (const char *const_str)
18126 {
18127 char *token_save = NULL;
18128 char *str = NULL;
18129
18130 if (strcmp (const_str, "none") == 0)
18131 {
18132 aarch64_sls_hardening = SLS_NONE;
18133 return;
18134 }
18135 if (strcmp (const_str, "all") == 0)
18136 {
18137 aarch64_sls_hardening = SLS_ALL;
18138 return;
18139 }
18140
18141 char *str_root = xstrdup (const_str);
18142 str = strtok_r (str_root, ",", &token_save);
18143 if (!str)
18144 error ("invalid argument given to %<-mharden-sls=%>");
18145
18146 int temp = SLS_NONE;
18147 while (str)
18148 {
18149 if (strcmp (str, "blr") == 0)
18150 temp |= SLS_BLR;
18151 else if (strcmp (str, "retbr") == 0)
18152 temp |= SLS_RETBR;
18153 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18154 {
18155 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18156 break;
18157 }
18158 else
18159 {
18160 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18161 break;
18162 }
18163 str = strtok_r (NULL, ",", &token_save);
18164 }
18165 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18166 free (str_root);
18167 }
18168
18169 /* Parse CONST_STR for the branch protection features specified in
18170 aarch64_branch_protect_types and set any global variables required.
18171 Return the parsing result and assign LAST_STR to the last processed
18172 token from CONST_STR so that it can be used for error reporting. */
18173
18174 static enum aarch64_parse_opt_result
18175 aarch64_parse_branch_protection (const char *const_str,
18176 char **last_str)
18177 {
18178 char *str_root = xstrdup (const_str);
18179 char* token_save = NULL;
18180 char *str = strtok_r (str_root, "+", &token_save);
18181 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
18182 if (!str)
18183 res = AARCH64_PARSE_MISSING_ARG;
18184 else
18185 {
18186 char *next_str = strtok_r (NULL, "+", &token_save);
18187 /* Reset the branch protection features to their defaults. */
18188 aarch64_handle_no_branch_protection (NULL, NULL);
18189
18190 while (str && res == AARCH64_PARSE_OK)
18191 {
18192 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
18193 bool found = false;
18194 /* Search for this type. */
18195 while (type && type->name && !found && res == AARCH64_PARSE_OK)
18196 {
18197 if (strcmp (str, type->name) == 0)
18198 {
18199 found = true;
18200 res = type->handler (str, next_str);
18201 str = next_str;
18202 next_str = strtok_r (NULL, "+", &token_save);
18203 }
18204 else
18205 type++;
18206 }
18207 if (found && res == AARCH64_PARSE_OK)
18208 {
18209 bool found_subtype = true;
18210 /* Loop through each token until we find one that isn't a
18211 subtype. */
18212 while (found_subtype)
18213 {
18214 found_subtype = false;
18215 const aarch64_branch_protect_type *subtype = type->subtypes;
18216 /* Search for the subtype. */
18217 while (str && subtype && subtype->name && !found_subtype
18218 && res == AARCH64_PARSE_OK)
18219 {
18220 if (strcmp (str, subtype->name) == 0)
18221 {
18222 found_subtype = true;
18223 res = subtype->handler (str, next_str);
18224 str = next_str;
18225 next_str = strtok_r (NULL, "+", &token_save);
18226 }
18227 else
18228 subtype++;
18229 }
18230 }
18231 }
18232 else if (!found)
18233 res = AARCH64_PARSE_INVALID_ARG;
18234 }
18235 }
18236 /* Copy the last processed token into the argument to pass it back.
18237 Used by option and attribute validation to print the offending token. */
18238 if (last_str)
18239 {
18240 if (str) strcpy (*last_str, str);
18241 else *last_str = NULL;
18242 }
18243 if (res == AARCH64_PARSE_OK)
18244 {
18245 /* If needed, alloc the accepted string then copy in const_str.
18246 Used by override_option_after_change_1. */
18247 if (!accepted_branch_protection_string)
18248 accepted_branch_protection_string = (char *) xmalloc (
18249 BRANCH_PROTECT_STR_MAX
18250 + 1);
18251 strncpy (accepted_branch_protection_string, const_str,
18252 BRANCH_PROTECT_STR_MAX + 1);
18253 /* Forcibly null-terminate. */
18254 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
18255 }
18256 return res;
18257 }
18258
18259 static bool
18260 aarch64_validate_mbranch_protection (const char *const_str)
18261 {
18262 char *str = (char *) xmalloc (strlen (const_str) + 1);
18263 enum aarch64_parse_opt_result res =
18264 aarch64_parse_branch_protection (const_str, &str);
18265 if (res == AARCH64_PARSE_INVALID_ARG)
18266 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
18267 else if (res == AARCH64_PARSE_MISSING_ARG)
18268 error ("missing argument for %<-mbranch-protection=%>");
18269 free (str);
18270 return res == AARCH64_PARSE_OK;
18271 }
18272
18273 /* Validate a command-line -march option. Parse the arch and extensions
18274 (if any) specified in STR and throw errors if appropriate. Put the
18275 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18276 option is valid. */
18277
18278 static bool
18279 aarch64_validate_march (const char *str, const struct processor **res,
18280 aarch64_feature_flags *isa_flags)
18281 {
18282 std::string invalid_extension;
18283 enum aarch64_parse_opt_result parse_res
18284 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18285
18286 if (parse_res == AARCH64_PARSE_OK)
18287 return true;
18288
18289 switch (parse_res)
18290 {
18291 case AARCH64_PARSE_MISSING_ARG:
18292 error ("missing arch name in %<-march=%s%>", str);
18293 break;
18294 case AARCH64_PARSE_INVALID_ARG:
18295 error ("unknown value %qs for %<-march%>", str);
18296 aarch64_print_hint_for_arch (str);
18297 /* A common user error is confusing -march and -mcpu.
18298 If the -march string matches a known CPU suggest -mcpu. */
18299 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18300 if (parse_res == AARCH64_PARSE_OK)
18301 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18302 break;
18303 case AARCH64_PARSE_INVALID_FEATURE:
18304 error ("invalid feature modifier %qs in %<-march=%s%>",
18305 invalid_extension.c_str (), str);
18306 aarch64_print_hint_for_extensions (invalid_extension);
18307 break;
18308 default:
18309 gcc_unreachable ();
18310 }
18311
18312 return false;
18313 }
18314
18315 /* Validate a command-line -mtune option. Parse the cpu
18316 specified in STR and throw errors if appropriate. Put the
18317 result, if it is valid, in RES. Return whether the option is
18318 valid. */
18319
18320 static bool
18321 aarch64_validate_mtune (const char *str, const struct processor **res)
18322 {
18323 enum aarch64_parse_opt_result parse_res
18324 = aarch64_parse_tune (str, res);
18325
18326 if (parse_res == AARCH64_PARSE_OK)
18327 return true;
18328
18329 switch (parse_res)
18330 {
18331 case AARCH64_PARSE_MISSING_ARG:
18332 error ("missing cpu name in %<-mtune=%s%>", str);
18333 break;
18334 case AARCH64_PARSE_INVALID_ARG:
18335 error ("unknown value %qs for %<-mtune%>", str);
18336 aarch64_print_hint_for_core (str);
18337 break;
18338 default:
18339 gcc_unreachable ();
18340 }
18341 return false;
18342 }
18343
18344 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
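/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4, whereas
   -msve-vector-bits=scalable (or 128 on big-endian targets) gives the
   runtime-variable poly_uint16 (2, 2) computed below.  */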
18345
18346 static poly_uint16
18347 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18348 {
18349 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18350 on big-endian targets, so we would need to forbid subregs that convert
18351 from one to the other. By default a reinterpret sequence would then
18352 involve a store to memory in one mode and a load back in the other.
18353 Even if we optimize that sequence using reverse instructions,
18354 it would still be a significant potential overhead.
18355
18356 For now, it seems better to generate length-agnostic code for that
18357 case instead. */
18358 if (value == SVE_SCALABLE
18359 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18360 return poly_uint16 (2, 2);
18361 else
18362 return (int) value / 64;
18363 }
18364
18365 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18366 aarch64_isa_flags accordingly. */
18367
18368 void
18369 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18370 {
18371 aarch64_set_asm_isa_flags (&global_options, flags);
18372 }
18373
18374 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18375 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18376 tuning structs. In particular it must set selected_tune and
18377 aarch64_asm_isa_flags that define the available ISA features and tuning
18378 decisions. It must also set selected_arch as this will be used to
18379 output the .arch asm tags for each function. */
18380
18381 static void
18382 aarch64_override_options (void)
18383 {
18384 aarch64_feature_flags cpu_isa = 0;
18385 aarch64_feature_flags arch_isa = 0;
18386 aarch64_set_asm_isa_flags (0);
18387
18388 const struct processor *cpu = NULL;
18389 const struct processor *arch = NULL;
18390 const struct processor *tune = NULL;
18391
18392 if (aarch64_harden_sls_string)
18393 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18394
18395 if (aarch64_branch_protection_string)
18396 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18397
18398 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18399 If either of -march or -mtune is given, they override their
18400 respective component of -mcpu. */
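/* For instance, "-mcpu=cortex-a53 -march=armv8.2-a" warns about the
   mismatch, compiles for the armv8.2-a ISA and keeps cortex-a53
   tuning.  */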
18401 if (aarch64_cpu_string)
18402 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18403
18404 if (aarch64_arch_string)
18405 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18406
18407 if (aarch64_tune_string)
18408 aarch64_validate_mtune (aarch64_tune_string, &tune);
18409
18410 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18411 SUBTARGET_OVERRIDE_OPTIONS;
18412 #endif
18413
18414 if (cpu && arch)
18415 {
18416 /* If both -mcpu and -march are specified, warn if they are not
18417 architecturally compatible and prefer the -march ISA flags. */
18418 if (arch->arch != cpu->arch)
18419 {
18420 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18421 aarch64_cpu_string,
18422 aarch64_arch_string);
18423 }
18424
18425 selected_arch = arch->arch;
18426 aarch64_set_asm_isa_flags (arch_isa);
18427 }
18428 else if (cpu)
18429 {
18430 selected_arch = cpu->arch;
18431 aarch64_set_asm_isa_flags (cpu_isa);
18432 }
18433 else if (arch)
18434 {
18435 cpu = &all_cores[arch->ident];
18436 selected_arch = arch->arch;
18437 aarch64_set_asm_isa_flags (arch_isa);
18438 }
18439 else
18440 {
18441 /* No -mcpu or -march specified, so use the default CPU. */
18442 cpu = &all_cores[TARGET_CPU_DEFAULT];
18443 selected_arch = cpu->arch;
18444 aarch64_set_asm_isa_flags (cpu->flags);
18445 }
18446
18447 selected_tune = tune ? tune->ident : cpu->ident;
18448
18449 if (aarch64_enable_bti == 2)
18450 {
18451 #ifdef TARGET_ENABLE_BTI
18452 aarch64_enable_bti = 1;
18453 #else
18454 aarch64_enable_bti = 0;
18455 #endif
18456 }
18457
18458 /* Return address signing is currently not supported for ILP32 targets. For
18459 LP64 targets use the configured option in the absence of a command-line
18460 option for -mbranch-protection. */
18461 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18462 {
18463 #ifdef TARGET_ENABLE_PAC_RET
18464 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
18465 #else
18466 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18467 #endif
18468 }
18469
18470 #ifndef HAVE_AS_MABI_OPTION
18471 /* The compiler may have been configured with 2.23.* binutils, which does
18472 not have support for ILP32. */
18473 if (TARGET_ILP32)
18474 error ("assembler does not support %<-mabi=ilp32%>");
18475 #endif
18476
18477 /* Convert -msve-vector-bits to a VG count. */
18478 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18479
18480 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
18481 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18482
18483 /* The pass to insert speculation tracking runs before
18484 shrink-wrapping and the latter does not know how to update the
18485 tracking status. So disable it in this case. */
18486 if (aarch64_track_speculation)
18487 flag_shrink_wrap = 0;
18488
18489 aarch64_override_options_internal (&global_options);
18490
18491 /* Save these options as the default ones in case we push and pop them later
18492 while processing functions with potential target attributes. */
18493 target_option_default_node = target_option_current_node
18494 = build_target_option_node (&global_options, &global_options_set);
18495 }
18496
18497 /* Implement targetm.override_options_after_change. */
18498
18499 static void
18500 aarch64_override_options_after_change (void)
18501 {
18502 aarch64_override_options_after_change_1 (&global_options);
18503 }
18504
18505 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18506 static char *
18507 aarch64_offload_options (void)
18508 {
18509 if (TARGET_ILP32)
18510 return xstrdup ("-foffload-abi=ilp32");
18511 else
18512 return xstrdup ("-foffload-abi=lp64");
18513 }
18514
18515 static struct machine_function *
18516 aarch64_init_machine_status (void)
18517 {
18518 struct machine_function *machine;
18519 machine = ggc_cleared_alloc<machine_function> ();
18520 return machine;
18521 }
18522
18523 void
18524 aarch64_init_expanders (void)
18525 {
18526 init_machine_status = aarch64_init_machine_status;
18527 }
18528
18529 /* Set aarch64_cmodel from the code model selected in OPTS, adjusting it for PIC and diagnosing unsupported combinations. */
18530 static void
18531 initialize_aarch64_code_model (struct gcc_options *opts)
18532 {
18533 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18534 switch (opts->x_aarch64_cmodel_var)
18535 {
18536 case AARCH64_CMODEL_TINY:
18537 if (opts->x_flag_pic)
18538 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18539 break;
18540 case AARCH64_CMODEL_SMALL:
18541 if (opts->x_flag_pic)
18542 {
18543 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18544 aarch64_cmodel = (flag_pic == 2
18545 ? AARCH64_CMODEL_SMALL_PIC
18546 : AARCH64_CMODEL_SMALL_SPIC);
18547 #else
18548 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18549 #endif
18550 }
18551 break;
18552 case AARCH64_CMODEL_LARGE:
18553 if (opts->x_flag_pic)
18554 sorry ("code model %qs with %<-f%s%>", "large",
18555 opts->x_flag_pic > 1 ? "PIC" : "pic");
18556 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18557 sorry ("code model %qs not supported in ilp32 mode", "large");
18558 break;
18559 case AARCH64_CMODEL_TINY_PIC:
18560 case AARCH64_CMODEL_SMALL_PIC:
18561 case AARCH64_CMODEL_SMALL_SPIC:
18562 gcc_unreachable ();
18563 }
18564 }
18565
18566 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18567 using the information saved in PTR. */
18568
18569 static void
18570 aarch64_option_restore (struct gcc_options *opts,
18571 struct gcc_options * /* opts_set */,
18572 struct cl_target_option * /* ptr */)
18573 {
18574 aarch64_override_options_internal (opts);
18575 }
18576
18577 /* Implement TARGET_OPTION_PRINT. */
18578
18579 static void
18580 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18581 {
18582 const struct processor *cpu
18583 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18584 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18585 std::string extension
18586 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18587 arch->flags);
18588
18589 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18590 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18591 arch->name, extension.c_str ());
18592 }
18593
18594 static GTY(()) tree aarch64_previous_fndecl;
18595
18596 void
18597 aarch64_reset_previous_fndecl (void)
18598 {
18599 aarch64_previous_fndecl = NULL;
18600 }
18601
18602 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18603 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18604 make sure optab availability predicates are recomputed when necessary. */
18605
18606 void
18607 aarch64_save_restore_target_globals (tree new_tree)
18608 {
18609 if (TREE_TARGET_GLOBALS (new_tree))
18610 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18611 else if (new_tree == target_option_default_node)
18612 restore_target_globals (&default_target_globals);
18613 else
18614 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18615 }
18616
18617 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18618 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18619 of the function, if such exists. This function may be called multiple
18620 times on a single function so use aarch64_previous_fndecl to avoid
18621 setting up identical state. */
18622
18623 static void
18624 aarch64_set_current_function (tree fndecl)
18625 {
18626 if (!fndecl || fndecl == aarch64_previous_fndecl)
18627 return;
18628
18629 tree old_tree = (aarch64_previous_fndecl
18630 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18631 : NULL_TREE);
18632
18633 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18634
18635 /* If current function has no attributes but the previous one did,
18636 use the default node. */
18637 if (!new_tree && old_tree)
18638 new_tree = target_option_default_node;
18639
18640 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18641 the default have been handled by aarch64_save_restore_target_globals from
18642 aarch64_pragma_target_parse. */
18643 if (old_tree == new_tree)
18644 return;
18645
18646 aarch64_previous_fndecl = fndecl;
18647
18648 /* First set the target options. */
18649 cl_target_option_restore (&global_options, &global_options_set,
18650 TREE_TARGET_OPTION (new_tree));
18651
18652 aarch64_save_restore_target_globals (new_tree);
18653 }
18654
18655 /* Enum describing the various ways we can handle attributes.
18656 In many cases we can reuse the generic option handling machinery. */
18657
18658 enum aarch64_attr_opt_type
18659 {
18660 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18661 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18662 aarch64_attr_enum, /* Attribute sets an enum variable. */
18663 aarch64_attr_custom /* Attribute requires a custom handling function. */
18664 };
18665
18666 /* All the information needed to handle a target attribute.
18667 NAME is the name of the attribute.
18668 ATTR_TYPE specifies the type of behavior of the attribute as described
18669 in the definition of enum aarch64_attr_opt_type.
18670 ALLOW_NEG is true if the attribute supports a "no-" form.
18671 HANDLER is the function that takes the attribute string as an argument.
18672 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18673 OPT_NUM is the enum specifying the option that the attribute modifies.
18674 This is needed for attributes that mirror the behavior of a command-line
18675 option, that is, attributes with ATTR_TYPE aarch64_attr_mask,
18676 aarch64_attr_bool or aarch64_attr_enum. */
18677
18678 struct aarch64_attribute_info
18679 {
18680 const char *name;
18681 enum aarch64_attr_opt_type attr_type;
18682 bool allow_neg;
18683 bool (*handler) (const char *);
18684 enum opt_code opt_num;
18685 };
18686
18687 /* Handle the ARCH_STR argument to the arch= target attribute. */
18688
18689 static bool
18690 aarch64_handle_attr_arch (const char *str)
18691 {
18692 const struct processor *tmp_arch = NULL;
18693 std::string invalid_extension;
18694 aarch64_feature_flags tmp_flags;
18695 enum aarch64_parse_opt_result parse_res
18696 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
18697
18698 if (parse_res == AARCH64_PARSE_OK)
18699 {
18700 gcc_assert (tmp_arch);
18701 selected_arch = tmp_arch->arch;
18702 aarch64_set_asm_isa_flags (tmp_flags);
18703 return true;
18704 }
18705
18706 switch (parse_res)
18707 {
18708 case AARCH64_PARSE_MISSING_ARG:
18709 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18710 break;
18711 case AARCH64_PARSE_INVALID_ARG:
18712 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18713 aarch64_print_hint_for_arch (str);
18714 break;
18715 case AARCH64_PARSE_INVALID_FEATURE:
18716 error ("invalid feature modifier %qs of value %qs in "
18717 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18718 aarch64_print_hint_for_extensions (invalid_extension);
18719 break;
18720 default:
18721 gcc_unreachable ();
18722 }
18723
18724 return false;
18725 }
18726
18727 /* Handle the argument CPU_STR to the cpu= target attribute. */
18728
18729 static bool
18730 aarch64_handle_attr_cpu (const char *str)
18731 {
18732 const struct processor *tmp_cpu = NULL;
18733 std::string invalid_extension;
18734 aarch64_feature_flags tmp_flags;
18735 enum aarch64_parse_opt_result parse_res
18736 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
18737
18738 if (parse_res == AARCH64_PARSE_OK)
18739 {
18740 gcc_assert (tmp_cpu);
18741 selected_tune = tmp_cpu->ident;
18742 selected_arch = tmp_cpu->arch;
18743 aarch64_set_asm_isa_flags (tmp_flags);
18744 return true;
18745 }
18746
18747 switch (parse_res)
18748 {
18749 case AARCH64_PARSE_MISSING_ARG:
18750 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18751 break;
18752 case AARCH64_PARSE_INVALID_ARG:
18753 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18754 aarch64_print_hint_for_core (str);
18755 break;
18756 case AARCH64_PARSE_INVALID_FEATURE:
18757 error ("invalid feature modifier %qs of value %qs in "
18758 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18759 aarch64_print_hint_for_extensions (invalid_extension);
18760 break;
18761 default:
18762 gcc_unreachable ();
18763 }
18764
18765 return false;
18766 }
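
/* Unlike "arch=", a "cpu=" attribute picks both the tuning target and the
   architecture implied by the named core.  As a hypothetical illustration,
   target ("cpu=cortex-a57+nofp") would arrive here with STR equal to
   "cortex-a57+nofp", setting SELECTED_TUNE and SELECTED_ARCH from the
   cortex-a57 entry and removing the FP extension from the ISA flags.  */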
18767
18768 /* Handle the argument STR to the branch-protection= attribute. */
18769
18770 static bool
18771 aarch64_handle_attr_branch_protection (const char* str)
18772 {
18773 char *err_str = (char *) xmalloc (strlen (str) + 1);
18774 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18775 &err_str);
18776 bool success = false;
18777 switch (res)
18778 {
18779 case AARCH64_PARSE_MISSING_ARG:
18780 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18781 " attribute");
18782 break;
18783 case AARCH64_PARSE_INVALID_ARG:
18784 error ("invalid protection type %qs in %<target(\"branch-protection"
18785 "=\")%> pragma or attribute", err_str);
18786 break;
18787 case AARCH64_PARSE_OK:
18788 success = true;
18789 /* Fall through. */
18790 case AARCH64_PARSE_INVALID_FEATURE:
18791 break;
18792 default:
18793 gcc_unreachable ();
18794 }
18795 free (err_str);
18796 return success;
18797 }
18798
18799 /* Handle the argument STR to the tune= target attribute. */
18800
18801 static bool
18802 aarch64_handle_attr_tune (const char *str)
18803 {
18804 const struct processor *tmp_tune = NULL;
18805 enum aarch64_parse_opt_result parse_res
18806 = aarch64_parse_tune (str, &tmp_tune);
18807
18808 if (parse_res == AARCH64_PARSE_OK)
18809 {
18810 gcc_assert (tmp_tune);
18811 selected_tune = tmp_tune->ident;
18812 return true;
18813 }
18814
18815 switch (parse_res)
18816 {
18817 case AARCH64_PARSE_INVALID_ARG:
18818 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18819 aarch64_print_hint_for_core (str);
18820 break;
18821 default:
18822 gcc_unreachable ();
18823 }
18824
18825 return false;
18826 }
18827
18828 /* Parse an architecture extensions target attribute string specified in STR.
18829 For example "+fp+nosimd". Show any errors if needed. Return TRUE
18830 if successful. Update aarch64_isa_flags to reflect the ISA features
18831 modified. */
18832
18833 static bool
18834 aarch64_handle_attr_isa_flags (char *str)
18835 {
18836 enum aarch64_parse_opt_result parse_res;
18837 auto isa_flags = aarch64_asm_isa_flags;
18838
18839 /* We allow "+nothing" at the beginning to clear out all architectural
18840 features if the user wants to handpick specific features. */
18841 if (strncmp ("+nothing", str, 8) == 0)
18842 {
18843 isa_flags = 0;
18844 str += 8;
18845 }
18846
18847 std::string invalid_extension;
18848 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18849
18850 if (parse_res == AARCH64_PARSE_OK)
18851 {
18852 aarch64_set_asm_isa_flags (isa_flags);
18853 return true;
18854 }
18855
18856 switch (parse_res)
18857 {
18858 case AARCH64_PARSE_MISSING_ARG:
18859 error ("missing value in %<target()%> pragma or attribute");
18860 break;
18861
18862 case AARCH64_PARSE_INVALID_FEATURE:
18863 error ("invalid feature modifier %qs of value %qs in "
18864 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18865 break;
18866
18867 default:
18868 gcc_unreachable ();
18869 }
18870
18871 return false;
18872 }
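
/* For example (hypothetical attribute strings), "+nothing+simd" clears all
   feature bits and then enables only what "+simd" implies, whereas
   "+nosimd" starts from the current aarch64_asm_isa_flags and removes the
   SIMD feature.  */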
18873
18874 /* The target attributes that we support. On top of these we also support just
18875 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18876 handled explicitly in aarch64_process_one_target_attr. */
18877
18878 static const struct aarch64_attribute_info aarch64_attributes[] =
18879 {
18880 { "general-regs-only", aarch64_attr_mask, false, NULL,
18881 OPT_mgeneral_regs_only },
18882 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18883 OPT_mfix_cortex_a53_835769 },
18884 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18885 OPT_mfix_cortex_a53_843419 },
18886 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18887 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18888 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18889 OPT_momit_leaf_frame_pointer },
18890 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18891 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18892 OPT_march_ },
18893 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18894 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18895 OPT_mtune_ },
18896 { "branch-protection", aarch64_attr_custom, false,
18897 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18898 { "sign-return-address", aarch64_attr_enum, false, NULL,
18899 OPT_msign_return_address_ },
18900 { "outline-atomics", aarch64_attr_bool, true, NULL,
18901 OPT_moutline_atomics},
18902 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
18903 };
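
/* As an illustrative example (hypothetical attribute string),
   "no-omit-leaf-frame-pointer,cmodel=small,arch=armv8-a" is split on
   commas by aarch64_process_target_attr below; the first token hits the
   aarch64_attr_bool entry for OPT_momit_leaf_frame_pointer in its negated
   form, the second hits the aarch64_attr_enum entry for OPT_mcmodel_, and
   the third is dispatched to the custom handler aarch64_handle_attr_arch.  */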
18904
18905 /* Parse ARG_STR which contains the definition of one target attribute.
18906 Show appropriate errors if any or return true if the attribute is valid. */
18907
18908 static bool
18909 aarch64_process_one_target_attr (char *arg_str)
18910 {
18911 bool invert = false;
18912
18913 size_t len = strlen (arg_str);
18914
18915 if (len == 0)
18916 {
18917 error ("malformed %<target()%> pragma or attribute");
18918 return false;
18919 }
18920
18921 char *str_to_check = (char *) alloca (len + 1);
18922 strcpy (str_to_check, arg_str);
18923
18924 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18925 It is easier to detect and handle it explicitly here rather than going
18926 through the machinery for the rest of the target attributes in this
18927 function. */
18928 if (*str_to_check == '+')
18929 return aarch64_handle_attr_isa_flags (str_to_check);
18930
18931 if (len > 3 && startswith (str_to_check, "no-"))
18932 {
18933 invert = true;
18934 str_to_check += 3;
18935 }
18936 char *arg = strchr (str_to_check, '=');
18937
18938 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18939 and point ARG to "foo". */
18940 if (arg)
18941 {
18942 *arg = '\0';
18943 arg++;
18944 }
18945 const struct aarch64_attribute_info *p_attr;
18946 bool found = false;
18947 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18948 {
18949 /* If the names don't match up, or the user has given an argument
18950 to an attribute that doesn't accept one, or didn't give an argument
18951 to an attribute that expects one, fail to match. */
18952 if (strcmp (str_to_check, p_attr->name) != 0)
18953 continue;
18954
18955 found = true;
18956 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18957 || p_attr->attr_type == aarch64_attr_enum;
18958
18959 if (attr_need_arg_p ^ (arg != NULL))
18960 {
18961 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18962 return false;
18963 }
18964
18965 /* If the name matches but the attribute does not allow "no-" versions
18966 then we can't match. */
18967 if (invert && !p_attr->allow_neg)
18968 {
18969 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18970 return false;
18971 }
18972
18973 switch (p_attr->attr_type)
18974 {
18975 /* Has a custom handler registered.
18976 For example, cpu=, arch=, tune=. */
18977 case aarch64_attr_custom:
18978 gcc_assert (p_attr->handler);
18979 if (!p_attr->handler (arg))
18980 return false;
18981 break;
18982
18983 /* Either set or unset a boolean option. */
18984 case aarch64_attr_bool:
18985 {
18986 struct cl_decoded_option decoded;
18987
18988 generate_option (p_attr->opt_num, NULL, !invert,
18989 CL_TARGET, &decoded);
18990 aarch64_handle_option (&global_options, &global_options_set,
18991 &decoded, input_location);
18992 break;
18993 }
18994 /* Set or unset a bit in the target_flags. aarch64_handle_option
18995 should know what mask to apply given the option number. */
18996 case aarch64_attr_mask:
18997 {
18998 struct cl_decoded_option decoded;
18999 /* We only need to specify the option number.
19000 aarch64_handle_option will know which mask to apply. */
19001 decoded.opt_index = p_attr->opt_num;
19002 decoded.value = !invert;
19003 aarch64_handle_option (&global_options, &global_options_set,
19004 &decoded, input_location);
19005 break;
19006 }
19007 /* Use the option setting machinery to set an option to an enum. */
19008 case aarch64_attr_enum:
19009 {
19010 gcc_assert (arg);
19011 bool valid;
19012 int value;
19013 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19014 &value, CL_TARGET);
19015 if (valid)
19016 {
19017 set_option (&global_options, NULL, p_attr->opt_num, value,
19018 NULL, DK_UNSPECIFIED, input_location,
19019 global_dc);
19020 }
19021 else
19022 {
19023 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19024 }
19025 break;
19026 }
19027 default:
19028 gcc_unreachable ();
19029 }
19030 }
19031
19032 /* If we reached here we either have found an attribute and validated
19033 it or didn't match any. If we matched an attribute but its arguments
19034 were malformed we will have returned false already. */
19035 return found;
19036 }
19037
19038 /* Count how many times the character C appears in
19039 NULL-terminated string STR. */
19040
19041 static unsigned int
19042 num_occurrences_in_str (char c, char *str)
19043 {
19044 unsigned int res = 0;
19045 while (*str != '\0')
19046 {
19047 if (*str == c)
19048 res++;
19049
19050 str++;
19051 }
19052
19053 return res;
19054 }
19055
19056 /* Parse the tree in ARGS that contains the target attribute information
19057 and update the global target options space. */
19058
19059 bool
19060 aarch64_process_target_attr (tree args)
19061 {
19062 if (TREE_CODE (args) == TREE_LIST)
19063 {
19064 do
19065 {
19066 tree head = TREE_VALUE (args);
19067 if (head)
19068 {
19069 if (!aarch64_process_target_attr (head))
19070 return false;
19071 }
19072 args = TREE_CHAIN (args);
19073 } while (args);
19074
19075 return true;
19076 }
19077
19078 if (TREE_CODE (args) != STRING_CST)
19079 {
19080 error ("attribute %<target%> argument not a string");
19081 return false;
19082 }
19083
19084 size_t len = strlen (TREE_STRING_POINTER (args));
19085 char *str_to_check = (char *) alloca (len + 1);
19086 strcpy (str_to_check, TREE_STRING_POINTER (args));
19087
19088 if (len == 0)
19089 {
19090 error ("malformed %<target()%> pragma or attribute");
19091 return false;
19092 }
19093
19094 /* Used to catch empty entries between commas, i.e.
19095 attribute ((target ("attr1,,attr2"))). */
19096 unsigned int num_commas = num_occurrences_in_str (',', str_to_check);
19097
19098 /* Handle multiple target attributes separated by ','. */
19099 char *token = strtok_r (str_to_check, ",", &str_to_check);
19100
19101 unsigned int num_attrs = 0;
19102 while (token)
19103 {
19104 num_attrs++;
19105 if (!aarch64_process_one_target_attr (token))
19106 {
19107 /* Check if token is possibly an arch extension without
19108 leading '+'. */
19109 aarch64_feature_flags isa_temp = 0;
19110 auto with_plus = std::string ("+") + token;
19111 enum aarch64_parse_opt_result ext_res
19112 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19113
19114 if (ext_res == AARCH64_PARSE_OK)
19115 error ("arch extension %<%s%> should be prefixed by %<+%>",
19116 token);
19117 else
19118 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19119 return false;
19120 }
19121
19122 token = strtok_r (NULL, ",", &str_to_check);
19123 }
19124
19125 if (num_attrs != num_commas + 1)
19126 {
19127 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19128 return false;
19129 }
19130
19131 return true;
19132 }
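
/* For example (hypothetical attribute strings),
   target ("arch=armv8-a,strict-align") is handled as two comma-separated
   attributes.  An empty entry, as in "strict-align,,outline-atomics", is
   caught by the NUM_ATTRS == NUM_COMMAS + 1 check above, and a bare
   extension name such as "crc" (missing its leading '+') produces the
   "should be prefixed by %<+%>" hint.  */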
19133
19134 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19135 process attribute ((target ("..."))). */
19136
19137 static bool
19138 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19139 {
19140 struct cl_target_option cur_target;
19141 bool ret;
19142 tree old_optimize;
19143 tree new_target, new_optimize;
19144 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19145
19146 /* If what we're processing is the current pragma string then the
19147 target option node is already stored in target_option_current_node
19148 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19149 having to re-parse the string. This is especially useful to keep
19150 arm_neon.h compile times down since that header contains a lot
19151 of intrinsics enclosed in pragmas. */
19152 if (!existing_target && args == current_target_pragma)
19153 {
19154 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19155 return true;
19156 }
19157 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19158
19159 old_optimize
19160 = build_optimization_node (&global_options, &global_options_set);
19161 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19162
19163 /* If the function changed the optimization levels as well as setting
19164 target options, start with the optimizations specified. */
19165 if (func_optimize && func_optimize != old_optimize)
19166 cl_optimization_restore (&global_options, &global_options_set,
19167 TREE_OPTIMIZATION (func_optimize));
19168
19169 /* Save the current target options to restore at the end. */
19170 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19171
19172 /* If fndecl already has some target attributes applied to it, unpack
19173 them so that we add this attribute on top of them, rather than
19174 overwriting them. */
19175 if (existing_target)
19176 {
19177 struct cl_target_option *existing_options
19178 = TREE_TARGET_OPTION (existing_target);
19179
19180 if (existing_options)
19181 cl_target_option_restore (&global_options, &global_options_set,
19182 existing_options);
19183 }
19184 else
19185 cl_target_option_restore (&global_options, &global_options_set,
19186 TREE_TARGET_OPTION (target_option_current_node));
19187
19188 ret = aarch64_process_target_attr (args);
19189
19190 /* Set up any additional state. */
19191 if (ret)
19192 {
19193 aarch64_override_options_internal (&global_options);
19194 new_target = build_target_option_node (&global_options,
19195 &global_options_set);
19196 }
19197 else
19198 new_target = NULL;
19199
19200 new_optimize = build_optimization_node (&global_options,
19201 &global_options_set);
19202
19203 if (fndecl && ret)
19204 {
19205 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19206
19207 if (old_optimize != new_optimize)
19208 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19209 }
19210
19211 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19212
19213 if (old_optimize != new_optimize)
19214 cl_optimization_restore (&global_options, &global_options_set,
19215 TREE_OPTIMIZATION (old_optimize));
19216 return ret;
19217 }
19218
19219 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
19220 tri-bool options (yes, no, don't care) and the default value is
19221 DEF, determine whether to reject inlining. */
19222
19223 static bool
19224 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19225 int dont_care, int def)
19226 {
19227 /* If the callee doesn't care, always allow inlining. */
19228 if (callee == dont_care)
19229 return true;
19230
19231 /* If the caller doesn't care, always allow inlining. */
19232 if (caller == dont_care)
19233 return true;
19234
19235 /* Otherwise, allow inlining if either the callee and caller values
19236 agree, or if the callee is using the default value. */
19237 return (callee == caller || callee == def);
19238 }
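
/* A worked example, based on the -momit-leaf-frame-pointer check below,
   which passes DONT_CARE == 2 and DEF == 1 (assuming the usual encoding in
   which 1 means enabled, 0 disabled and 2 unspecified): a caller built with
   an explicit -momit-leaf-frame-pointer (value 1) may inline a callee that
   left the option unspecified (value 2), but not one built with an explicit
   -mno-omit-leaf-frame-pointer (value 0).  */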
19239
19240 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
19241 to inline CALLEE into CALLER based on target-specific info.
19242 Make sure that the caller and callee have compatible architectural
19243 features. Then go through the other possible target attributes
19244 and see if they can block inlining. Try not to reject always_inline
19245 callees unless they are incompatible architecturally. */
19246
19247 static bool
19248 aarch64_can_inline_p (tree caller, tree callee)
19249 {
19250 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19251 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19252
19253 struct cl_target_option *caller_opts
19254 = TREE_TARGET_OPTION (caller_tree ? caller_tree
19255 : target_option_default_node);
19256
19257 struct cl_target_option *callee_opts
19258 = TREE_TARGET_OPTION (callee_tree ? callee_tree
19259 : target_option_default_node);
19260
19261 /* Callee's ISA flags should be a subset of the caller's. */
19262 if ((caller_opts->x_aarch64_asm_isa_flags
19263 & callee_opts->x_aarch64_asm_isa_flags)
19264 != callee_opts->x_aarch64_asm_isa_flags)
19265 return false;
19266 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
19267 != callee_opts->x_aarch64_isa_flags)
19268 return false;
19269
19270 /* Allow non-strict aligned functions inlining into strict
19271 aligned ones. */
19272 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19273 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19274 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19275 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19276 return false;
19277
19278 bool always_inline = lookup_attribute ("always_inline",
19279 DECL_ATTRIBUTES (callee));
19280
19281 /* If the architectural features match up and the callee is always_inline
19282 then the other attributes don't matter. */
19283 if (always_inline)
19284 return true;
19285
19286 if (caller_opts->x_aarch64_cmodel_var
19287 != callee_opts->x_aarch64_cmodel_var)
19288 return false;
19289
19290 if (caller_opts->x_aarch64_tls_dialect
19291 != callee_opts->x_aarch64_tls_dialect)
19292 return false;
19293
19294 /* Honour explicit requests to workaround errata. */
19295 if (!aarch64_tribools_ok_for_inlining_p (
19296 caller_opts->x_aarch64_fix_a53_err835769,
19297 callee_opts->x_aarch64_fix_a53_err835769,
19298 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19299 return false;
19300
19301 if (!aarch64_tribools_ok_for_inlining_p (
19302 caller_opts->x_aarch64_fix_a53_err843419,
19303 callee_opts->x_aarch64_fix_a53_err843419,
19304 2, TARGET_FIX_ERR_A53_843419))
19305 return false;
19306
19307 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19308 caller and callee and they don't match up, reject inlining. */
19309 if (!aarch64_tribools_ok_for_inlining_p (
19310 caller_opts->x_flag_omit_leaf_frame_pointer,
19311 callee_opts->x_flag_omit_leaf_frame_pointer,
19312 2, 1))
19313 return false;
19314
19315 /* If the callee has specific tuning overrides, respect them. */
19316 if (callee_opts->x_aarch64_override_tune_string != NULL
19317 && caller_opts->x_aarch64_override_tune_string == NULL)
19318 return false;
19319
19320 /* If the user specified tuning override strings for the
19321 caller and callee and they don't match up, reject inlining.
19322 We just do a string compare here, we don't analyze the meaning
19323 of the string, as it would be too costly for little gain. */
19324 if (callee_opts->x_aarch64_override_tune_string
19325 && caller_opts->x_aarch64_override_tune_string
19326 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19327 caller_opts->x_aarch64_override_tune_string) != 0))
19328 return false;
19329
19330 return true;
19331 }
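
/* A hypothetical example: a callee declared with
   __attribute__ ((target ("+sve"))) cannot be inlined into a caller compiled
   for plain -march=armv8-a, because the callee's ISA flags would not be a
   subset of the caller's; inlining in the opposite direction is allowed,
   subject to the remaining checks above.  */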
19332
19333 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19334 hasn't been initialized already. */
19335
19336 unsigned int
19337 aarch64_tlsdesc_abi_id ()
19338 {
19339 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19340 if (!tlsdesc_abi.initialized_p ())
19341 {
19342 HARD_REG_SET full_reg_clobbers;
19343 CLEAR_HARD_REG_SET (full_reg_clobbers);
19344 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19345 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19346 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19347 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19348 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19349 }
19350 return tlsdesc_abi.id ();
19351 }
19352
19353 /* Return true if SYMBOL_REF X binds locally. */
19354
19355 static bool
19356 aarch64_symbol_binds_local_p (const_rtx x)
19357 {
19358 return (SYMBOL_REF_DECL (x)
19359 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19360 : SYMBOL_REF_LOCAL_P (x));
19361 }
19362
19363 /* Return true if SYMBOL_REF X is thread local. */
19364 static bool
19365 aarch64_tls_symbol_p (rtx x)
19366 {
19367 if (! TARGET_HAVE_TLS)
19368 return false;
19369
19370 x = strip_salt (x);
19371 if (!SYMBOL_REF_P (x))
19372 return false;
19373
19374 return SYMBOL_REF_TLS_MODEL (x) != 0;
19375 }
19376
19377 /* Classify a TLS symbol into one of the TLS kinds. */
19378 enum aarch64_symbol_type
19379 aarch64_classify_tls_symbol (rtx x)
19380 {
19381 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19382
19383 switch (tls_kind)
19384 {
19385 case TLS_MODEL_GLOBAL_DYNAMIC:
19386 case TLS_MODEL_LOCAL_DYNAMIC:
19387 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19388
19389 case TLS_MODEL_INITIAL_EXEC:
19390 switch (aarch64_cmodel)
19391 {
19392 case AARCH64_CMODEL_TINY:
19393 case AARCH64_CMODEL_TINY_PIC:
19394 return SYMBOL_TINY_TLSIE;
19395 default:
19396 return SYMBOL_SMALL_TLSIE;
19397 }
19398
19399 case TLS_MODEL_LOCAL_EXEC:
19400 if (aarch64_tls_size == 12)
19401 return SYMBOL_TLSLE12;
19402 else if (aarch64_tls_size == 24)
19403 return SYMBOL_TLSLE24;
19404 else if (aarch64_tls_size == 32)
19405 return SYMBOL_TLSLE32;
19406 else if (aarch64_tls_size == 48)
19407 return SYMBOL_TLSLE48;
19408 else
19409 gcc_unreachable ();
19410
19411 case TLS_MODEL_EMULATED:
19412 case TLS_MODEL_NONE:
19413 return SYMBOL_FORCE_TO_MEM;
19414
19415 default:
19416 gcc_unreachable ();
19417 }
19418 }
19419
19420 /* Return the correct method for accessing X + OFFSET, where X is either
19421 a SYMBOL_REF or LABEL_REF. */
19422
19423 enum aarch64_symbol_type
19424 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19425 {
19426 x = strip_salt (x);
19427
19428 if (LABEL_REF_P (x))
19429 {
19430 switch (aarch64_cmodel)
19431 {
19432 case AARCH64_CMODEL_LARGE:
19433 return SYMBOL_FORCE_TO_MEM;
19434
19435 case AARCH64_CMODEL_TINY_PIC:
19436 case AARCH64_CMODEL_TINY:
19437 return SYMBOL_TINY_ABSOLUTE;
19438
19439 case AARCH64_CMODEL_SMALL_SPIC:
19440 case AARCH64_CMODEL_SMALL_PIC:
19441 case AARCH64_CMODEL_SMALL:
19442 return SYMBOL_SMALL_ABSOLUTE;
19443
19444 default:
19445 gcc_unreachable ();
19446 }
19447 }
19448
19449 if (SYMBOL_REF_P (x))
19450 {
19451 if (aarch64_tls_symbol_p (x))
19452 return aarch64_classify_tls_symbol (x);
19453
19454 switch (aarch64_cmodel)
19455 {
19456 case AARCH64_CMODEL_TINY_PIC:
19457 case AARCH64_CMODEL_TINY:
19458 /* With -fPIC non-local symbols use the GOT. For orthogonality
19459 always use the GOT for extern weak symbols. */
19460 if ((flag_pic || SYMBOL_REF_WEAK (x))
19461 && !aarch64_symbol_binds_local_p (x))
19462 return SYMBOL_TINY_GOT;
19463
19464 /* When we retrieve symbol + offset address, we have to make sure
19465 the offset does not cause overflow of the final address. But
19466 we have no way of knowing the address of the symbol at compile time,
19467 so we can't accurately say if the distance between the PC and
19468 symbol + offset is outside the addressable range of +/-1MB in the
19469 TINY code model. So we limit the maximum offset to +/-64KB and
19470 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19471 If offset_within_block_p is true we allow larger offsets. */
19472 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19473 || offset_within_block_p (x, offset)))
19474 return SYMBOL_FORCE_TO_MEM;
19475
19476 return SYMBOL_TINY_ABSOLUTE;
19477
19478
19479 case AARCH64_CMODEL_SMALL_SPIC:
19480 case AARCH64_CMODEL_SMALL_PIC:
19481 case AARCH64_CMODEL_SMALL:
19482 if ((flag_pic || SYMBOL_REF_WEAK (x))
19483 && !aarch64_symbol_binds_local_p (x))
19484 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19485 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19486
19487 /* Same reasoning as the tiny code model, but the offset cap here is
19488 1MB, allowing +/-3.9GB for the offset to the symbol. */
19489 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19490 || offset_within_block_p (x, offset)))
19491 return SYMBOL_FORCE_TO_MEM;
19492
19493 return SYMBOL_SMALL_ABSOLUTE;
19494
19495 case AARCH64_CMODEL_LARGE:
19496 /* This is alright even in PIC code as the constant
19497 pool reference is always PC relative and within
19498 the same translation unit. */
19499 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19500 return SYMBOL_SMALL_ABSOLUTE;
19501 else
19502 return SYMBOL_FORCE_TO_MEM;
19503
19504 default:
19505 gcc_unreachable ();
19506 }
19507 }
19508
19509 /* By default push everything into the constant pool. */
19510 return SYMBOL_FORCE_TO_MEM;
19511 }
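
/* An illustrative case: for a non-weak, locally-binding symbol SYM, an
   access to SYM + 0x20000 under -mcmodel=tiny exceeds the +/-64KB offset
   cap above and is forced to memory (unless offset_within_block_p allows
   it), while under the default small code model the same offset is well
   within the +/-1MB cap and is classified as SYMBOL_SMALL_ABSOLUTE.  */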
19512
19513 bool
19514 aarch64_constant_address_p (rtx x)
19515 {
19516 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19517 }
19518
19519 bool
19520 aarch64_legitimate_pic_operand_p (rtx x)
19521 {
19522 poly_int64 offset;
19523 x = strip_offset_and_salt (x, &offset);
19524 if (SYMBOL_REF_P (x))
19525 return false;
19526
19527 return true;
19528 }
19529
19530 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19531 that should be rematerialized rather than spilled. */
19532
19533 static bool
19534 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19535 {
19536 /* Support CSE and rematerialization of common constants. */
19537 if (CONST_INT_P (x)
19538 || CONST_DOUBLE_P (x))
19539 return true;
19540
19541 /* Only accept variable-length vector constants if they can be
19542 handled directly.
19543
19544 ??? It would be possible (but complex) to handle rematerialization
19545 of other constants via secondary reloads. */
19546 if (!GET_MODE_SIZE (mode).is_constant ())
19547 return aarch64_simd_valid_immediate (x, NULL);
19548
19549 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19550 least be forced to memory and loaded from there. */
19551 if (CONST_VECTOR_P (x))
19552 return !targetm.cannot_force_const_mem (mode, x);
19553
19554 /* Do not allow vector struct mode constants for Advanced SIMD.
19555 We could support 0 and -1 easily, but they need support in
19556 aarch64-simd.md. */
19557 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19558 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19559 return false;
19560
19561 if (GET_CODE (x) == HIGH)
19562 x = XEXP (x, 0);
19563
19564 /* Accept polynomial constants that can be calculated by using the
19565 destination of a move as the sole temporary. Constants that
19566 require a second temporary cannot be rematerialized (they can't be
19567 forced to memory and also aren't legitimate constants). */
19568 poly_int64 offset;
19569 if (poly_int_rtx_p (x, &offset))
19570 return aarch64_offset_temporaries (false, offset) <= 1;
19571
19572 /* If an offset is being added to something else, we need to allow the
19573 base to be moved into the destination register, meaning that there
19574 are no free temporaries for the offset. */
19575 x = strip_offset_and_salt (x, &offset);
19576 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19577 return false;
19578
19579 /* Do not allow const (plus (anchor_symbol, const_int)). */
19580 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19581 return false;
19582
19583 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19584 so spilling them is better than rematerialization. */
19585 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19586 return true;
19587
19588 /* Label references are always constant. */
19589 if (LABEL_REF_P (x))
19590 return true;
19591
19592 return false;
19593 }
19594
19595 rtx
19596 aarch64_load_tp (rtx target)
19597 {
19598 if (!target
19599 || GET_MODE (target) != Pmode
19600 || !register_operand (target, Pmode))
19601 target = gen_reg_rtx (Pmode);
19602
19603 /* Can return in any reg. */
19604 emit_insn (gen_aarch64_load_tp_hard (target));
19605 return target;
19606 }
19607
19608 /* On AAPCS systems, this is the "struct __va_list". */
19609 static GTY(()) tree va_list_type;
19610
19611 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19612 Return the type to use as __builtin_va_list.
19613
19614 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19615
19616 struct __va_list
19617 {
19618 void *__stack;
19619 void *__gr_top;
19620 void *__vr_top;
19621 int __gr_offs;
19622 int __vr_offs;
19623 }; */
19624
19625 static tree
19626 aarch64_build_builtin_va_list (void)
19627 {
19628 tree va_list_name;
19629 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19630
19631 /* Create the type. */
19632 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19633 /* Give it the required name. */
19634 va_list_name = build_decl (BUILTINS_LOCATION,
19635 TYPE_DECL,
19636 get_identifier ("__va_list"),
19637 va_list_type);
19638 DECL_ARTIFICIAL (va_list_name) = 1;
19639 TYPE_NAME (va_list_type) = va_list_name;
19640 TYPE_STUB_DECL (va_list_type) = va_list_name;
19641
19642 /* Create the fields. */
19643 f_stack = build_decl (BUILTINS_LOCATION,
19644 FIELD_DECL, get_identifier ("__stack"),
19645 ptr_type_node);
19646 f_grtop = build_decl (BUILTINS_LOCATION,
19647 FIELD_DECL, get_identifier ("__gr_top"),
19648 ptr_type_node);
19649 f_vrtop = build_decl (BUILTINS_LOCATION,
19650 FIELD_DECL, get_identifier ("__vr_top"),
19651 ptr_type_node);
19652 f_groff = build_decl (BUILTINS_LOCATION,
19653 FIELD_DECL, get_identifier ("__gr_offs"),
19654 integer_type_node);
19655 f_vroff = build_decl (BUILTINS_LOCATION,
19656 FIELD_DECL, get_identifier ("__vr_offs"),
19657 integer_type_node);
19658
19659 /* Tell tree-stdarg pass about our internal offset fields.
19660 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19661 purposes, to identify whether the code is updating the va_list internal
19662 offset fields in an irregular way. */
19663 va_list_gpr_counter_field = f_groff;
19664 va_list_fpr_counter_field = f_vroff;
19665
19666 DECL_ARTIFICIAL (f_stack) = 1;
19667 DECL_ARTIFICIAL (f_grtop) = 1;
19668 DECL_ARTIFICIAL (f_vrtop) = 1;
19669 DECL_ARTIFICIAL (f_groff) = 1;
19670 DECL_ARTIFICIAL (f_vroff) = 1;
19671
19672 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19673 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19674 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19675 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19676 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19677
19678 TYPE_FIELDS (va_list_type) = f_stack;
19679 DECL_CHAIN (f_stack) = f_grtop;
19680 DECL_CHAIN (f_grtop) = f_vrtop;
19681 DECL_CHAIN (f_vrtop) = f_groff;
19682 DECL_CHAIN (f_groff) = f_vroff;
19683
19684 /* Compute its layout. */
19685 layout_type (va_list_type);
19686
19687 return va_list_type;
19688 }
19689
19690 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19691 static void
19692 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19693 {
19694 const CUMULATIVE_ARGS *cum;
19695 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19696 tree stack, grtop, vrtop, groff, vroff;
19697 tree t;
19698 int gr_save_area_size = cfun->va_list_gpr_size;
19699 int vr_save_area_size = cfun->va_list_fpr_size;
19700 int vr_offset;
19701
19702 cum = &crtl->args.info;
19703 if (cfun->va_list_gpr_size)
19704 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19705 cfun->va_list_gpr_size);
19706 if (cfun->va_list_fpr_size)
19707 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19708 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19709
19710 if (!TARGET_FLOAT)
19711 {
19712 gcc_assert (cum->aapcs_nvrn == 0);
19713 vr_save_area_size = 0;
19714 }
19715
19716 f_stack = TYPE_FIELDS (va_list_type_node);
19717 f_grtop = DECL_CHAIN (f_stack);
19718 f_vrtop = DECL_CHAIN (f_grtop);
19719 f_groff = DECL_CHAIN (f_vrtop);
19720 f_vroff = DECL_CHAIN (f_groff);
19721
19722 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19723 NULL_TREE);
19724 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19725 NULL_TREE);
19726 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19727 NULL_TREE);
19728 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19729 NULL_TREE);
19730 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19731 NULL_TREE);
19732
19733 /* Emit code to initialize STACK, which points to the next varargs stack
19734 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19735 by named arguments. STACK is 8-byte aligned. */
19736 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19737 if (cum->aapcs_stack_size > 0)
19738 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19739 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19740 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19741
19742 /* Emit code to initialize GRTOP, the top of the GR save area.
19743 virtual_incoming_args_rtx should have been 16 byte aligned. */
19744 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19745 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19746 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19747
19748 /* Emit code to initialize VRTOP, the top of the VR save area.
19749 This address is gr_save_area_bytes below GRTOP, rounded
19750 down to the next 16-byte boundary. */
19751 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19752 vr_offset = ROUND_UP (gr_save_area_size,
19753 STACK_BOUNDARY / BITS_PER_UNIT);
19754
19755 if (vr_offset)
19756 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19757 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19758 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19759
19760 /* Emit code to initialize GROFF, the offset from GRTOP of the
19761 next GPR argument. */
19762 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19763 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19764 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19765
19766 /* Likewise emit code to initialize VROFF, the offset from VRTOP
19767 of the next VR argument. */
19768 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19769 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19770 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19771 }
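
/* A worked example of the layout set up above, assuming the save areas are
   not limited by cfun->va_list_gpr_size/va_list_fpr_size: if three general
   registers and two vector registers remain unnamed, gr_save_area_size is
   24 and vr_save_area_size is 32, so __gr_top points at
   virtual_incoming_args_rtx, __vr_top sits ROUND_UP (24, 16) == 32 bytes
   below it, __gr_offs is -24 and __vr_offs is -32.  */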
19772
19773 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19774
19775 static tree
19776 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19777 gimple_seq *post_p ATTRIBUTE_UNUSED)
19778 {
19779 tree addr;
19780 bool indirect_p;
19781 bool is_ha; /* is HFA or HVA. */
19782 bool dw_align; /* double-word align. */
19783 machine_mode ag_mode = VOIDmode;
19784 int nregs;
19785 machine_mode mode;
19786
19787 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19788 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19789 HOST_WIDE_INT size, rsize, adjust, align;
19790 tree t, u, cond1, cond2;
19791
19792 indirect_p = pass_va_arg_by_reference (type);
19793 if (indirect_p)
19794 type = build_pointer_type (type);
19795
19796 mode = TYPE_MODE (type);
19797
19798 f_stack = TYPE_FIELDS (va_list_type_node);
19799 f_grtop = DECL_CHAIN (f_stack);
19800 f_vrtop = DECL_CHAIN (f_grtop);
19801 f_groff = DECL_CHAIN (f_vrtop);
19802 f_vroff = DECL_CHAIN (f_groff);
19803
19804 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19805 f_stack, NULL_TREE);
19806 size = int_size_in_bytes (type);
19807
19808 unsigned int abi_break;
19809 unsigned int abi_break_packed;
19810 align
19811 = aarch64_function_arg_alignment (mode, type, &abi_break, &abi_break_packed)
19812 / BITS_PER_UNIT;
19813
19814 dw_align = false;
19815 adjust = 0;
19816 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19817 &is_ha, false))
19818 {
19819 /* No frontends can create types with variable-sized modes, so we
19820 shouldn't be asked to pass or return them. */
19821 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19822
19823 /* TYPE passed in fp/simd registers. */
19824 if (!TARGET_FLOAT)
19825 aarch64_err_no_fpadvsimd (mode);
19826
19827 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19828 unshare_expr (valist), f_vrtop, NULL_TREE);
19829 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19830 unshare_expr (valist), f_vroff, NULL_TREE);
19831
19832 rsize = nregs * UNITS_PER_VREG;
19833
19834 if (is_ha)
19835 {
19836 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19837 adjust = UNITS_PER_VREG - ag_size;
19838 }
19839 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19840 && size < UNITS_PER_VREG)
19841 {
19842 adjust = UNITS_PER_VREG - size;
19843 }
19844 }
19845 else
19846 {
19847 /* TYPE passed in general registers. */
19848 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19849 unshare_expr (valist), f_grtop, NULL_TREE);
19850 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19851 unshare_expr (valist), f_groff, NULL_TREE);
19852 rsize = ROUND_UP (size, UNITS_PER_WORD);
19853 nregs = rsize / UNITS_PER_WORD;
19854
19855 if (align <= 8 && abi_break_packed && warn_psabi)
19856 inform (input_location, "parameter passing for argument of type "
19857 "%qT changed in GCC 13.1", type);
19858
19859 if (align > 8)
19860 {
19861 if (abi_break && warn_psabi)
19862 inform (input_location, "parameter passing for argument of type "
19863 "%qT changed in GCC 9.1", type);
19864 dw_align = true;
19865 }
19866
19867 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19868 && size < UNITS_PER_WORD)
19869 {
19870 adjust = UNITS_PER_WORD - size;
19871 }
19872 }
19873
19874 /* Get a local temporary for the field value. */
19875 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19876
19877 /* Emit code to branch if off >= 0. */
19878 t = build2 (GE_EXPR, boolean_type_node, off,
19879 build_int_cst (TREE_TYPE (off), 0));
19880 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19881
19882 if (dw_align)
19883 {
19884 /* Emit: offs = (offs + 15) & -16. */
19885 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19886 build_int_cst (TREE_TYPE (off), 15));
19887 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19888 build_int_cst (TREE_TYPE (off), -16));
19889 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19890 }
19891 else
19892 roundup = NULL;
19893
19894 /* Update ap.__[g|v]r_offs */
19895 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19896 build_int_cst (TREE_TYPE (off), rsize));
19897 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19898
19899 /* String up. */
19900 if (roundup)
19901 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19902
19903 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19904 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19905 build_int_cst (TREE_TYPE (f_off), 0));
19906 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19907
19908 /* String up: make sure the assignment happens before the use. */
19909 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19910 COND_EXPR_ELSE (cond1) = t;
19911
19912 /* Prepare the trees handling the argument that is passed on the stack;
19913 the top-level node is stored in ON_STACK. */
19914 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19915 if (align > 8)
19916 {
19917 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19918 t = fold_build_pointer_plus_hwi (arg, 15);
19919 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19920 build_int_cst (TREE_TYPE (t), -16));
19921 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19922 }
19923 else
19924 roundup = NULL;
19925 /* Advance ap.__stack */
19926 t = fold_build_pointer_plus_hwi (arg, size + 7);
19927 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19928 build_int_cst (TREE_TYPE (t), -8));
19929 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19930 /* String up roundup and advance. */
19931 if (roundup)
19932 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19933 /* String up with arg */
19934 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19935 /* Big-endianness related address adjustment. */
19936 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19937 && size < UNITS_PER_WORD)
19938 {
19939 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19940 size_int (UNITS_PER_WORD - size));
19941 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19942 }
19943
19944 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19945 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19946
19947 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19948 t = off;
19949 if (adjust)
19950 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19951 build_int_cst (TREE_TYPE (off), adjust));
19952
19953 t = fold_convert (sizetype, t);
19954 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19955
19956 if (is_ha)
19957 {
19958 /* type ha; // treat as "struct {ftype field[n];}"
19959 ... [computing offs]
19960 for (i = 0; i <nregs; ++i, offs += 16)
19961 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19962 return ha; */
19963 int i;
19964 tree tmp_ha, field_t, field_ptr_t;
19965
19966 /* Declare a local variable. */
19967 tmp_ha = create_tmp_var_raw (type, "ha");
19968 gimple_add_tmp_var (tmp_ha);
19969
19970 /* Establish the base type. */
19971 switch (ag_mode)
19972 {
19973 case E_SFmode:
19974 field_t = float_type_node;
19975 field_ptr_t = float_ptr_type_node;
19976 break;
19977 case E_DFmode:
19978 field_t = double_type_node;
19979 field_ptr_t = double_ptr_type_node;
19980 break;
19981 case E_TFmode:
19982 field_t = long_double_type_node;
19983 field_ptr_t = long_double_ptr_type_node;
19984 break;
19985 case E_SDmode:
19986 field_t = dfloat32_type_node;
19987 field_ptr_t = build_pointer_type (dfloat32_type_node);
19988 break;
19989 case E_DDmode:
19990 field_t = dfloat64_type_node;
19991 field_ptr_t = build_pointer_type (dfloat64_type_node);
19992 break;
19993 case E_TDmode:
19994 field_t = dfloat128_type_node;
19995 field_ptr_t = build_pointer_type (dfloat128_type_node);
19996 break;
19997 case E_HFmode:
19998 field_t = aarch64_fp16_type_node;
19999 field_ptr_t = aarch64_fp16_ptr_type_node;
20000 break;
20001 case E_BFmode:
20002 field_t = aarch64_bf16_type_node;
20003 field_ptr_t = aarch64_bf16_ptr_type_node;
20004 break;
20005 case E_V2SImode:
20006 case E_V4SImode:
20007 {
20008 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
20009 field_t = build_vector_type_for_mode (innertype, ag_mode);
20010 field_ptr_t = build_pointer_type (field_t);
20011 }
20012 break;
20013 default:
20014 gcc_assert (0);
20015 }
20016
20017 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
20018 TREE_ADDRESSABLE (tmp_ha) = 1;
20019 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
20020 addr = t;
20021 t = fold_convert (field_ptr_t, addr);
20022 t = build2 (MODIFY_EXPR, field_t,
20023 build1 (INDIRECT_REF, field_t, tmp_ha),
20024 build1 (INDIRECT_REF, field_t, t));
20025
20026 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
20027 for (i = 1; i < nregs; ++i)
20028 {
20029 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
20030 u = fold_convert (field_ptr_t, addr);
20031 u = build2 (MODIFY_EXPR, field_t,
20032 build2 (MEM_REF, field_t, tmp_ha,
20033 build_int_cst (field_ptr_t,
20034 (i *
20035 int_size_in_bytes (field_t)))),
20036 build1 (INDIRECT_REF, field_t, u));
20037 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
20038 }
20039
20040 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
20041 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
20042 }
20043
20044 COND_EXPR_ELSE (cond2) = t;
20045 addr = fold_convert (build_pointer_type (type), cond1);
20046 addr = build_va_arg_indirect_ref (addr);
20047
20048 if (indirect_p)
20049 addr = build_va_arg_indirect_ref (addr);
20050
20051 return addr;
20052 }
20053
20054 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
20055
20056 static void
20057 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
20058 const function_arg_info &arg,
20059 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
20060 {
20061 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
20062 CUMULATIVE_ARGS local_cum;
20063 int gr_saved = cfun->va_list_gpr_size;
20064 int vr_saved = cfun->va_list_fpr_size;
20065
20066 /* The caller has advanced CUM up to, but not beyond, the last named
20067 argument. Advance a local copy of CUM past the last "real" named
20068 argument, to find out how many registers are left over. */
20069 local_cum = *cum;
20070 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
20071 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
20072
20073 /* Find out how many registers we need to save.
20074 Honor the tree-stdarg analysis results. */
20075 if (cfun->va_list_gpr_size)
20076 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
20077 cfun->va_list_gpr_size / UNITS_PER_WORD);
20078 if (cfun->va_list_fpr_size)
20079 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
20080 cfun->va_list_fpr_size / UNITS_PER_VREG);
20081
20082 if (!TARGET_FLOAT)
20083 {
20084 gcc_assert (local_cum.aapcs_nvrn == 0);
20085 vr_saved = 0;
20086 }
20087
20088 if (!no_rtl)
20089 {
20090 if (gr_saved > 0)
20091 {
20092 rtx ptr, mem;
20093
20094 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
20095 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
20096 - gr_saved * UNITS_PER_WORD);
20097 mem = gen_frame_mem (BLKmode, ptr);
20098 set_mem_alias_set (mem, get_varargs_alias_set ());
20099
20100 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
20101 mem, gr_saved);
20102 }
20103 if (vr_saved > 0)
20104 {
20105 /* We can't use move_block_from_reg, because it will use
20106 the wrong mode, storing D regs only. */
20107 machine_mode mode = TImode;
20108 int off, i, vr_start;
20109
20110 /* Set OFF to the offset from virtual_incoming_args_rtx of
20111 the first vector register. The VR save area lies below
20112 the GR one, and is aligned to 16 bytes. */
20113 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
20114 STACK_BOUNDARY / BITS_PER_UNIT);
20115 off -= vr_saved * UNITS_PER_VREG;
20116
20117 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
20118 for (i = 0; i < vr_saved; ++i)
20119 {
20120 rtx ptr, mem;
20121
20122 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
20123 mem = gen_frame_mem (mode, ptr);
20124 set_mem_alias_set (mem, get_varargs_alias_set ());
20125 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
20126 off += UNITS_PER_VREG;
20127 }
20128 }
20129 }
20130
20131 /* We don't save the size into *PRETEND_SIZE because we want to avoid
20132 any complication of having crtl->args.pretend_args_size changed. */
20133 cfun->machine->frame.saved_varargs_size
20134 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20135 STACK_BOUNDARY / BITS_PER_UNIT)
20136 + vr_saved * UNITS_PER_VREG);
20137 }
20138
20139 static void
20140 aarch64_conditional_register_usage (void)
20141 {
20142 int i;
20143 if (!TARGET_FLOAT)
20144 {
20145 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20146 {
20147 fixed_regs[i] = 1;
20148 call_used_regs[i] = 1;
20149 CLEAR_HARD_REG_BIT (operand_reg_set, i);
20150 }
20151 }
20152 if (!TARGET_SVE)
20153 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20154 {
20155 fixed_regs[i] = 1;
20156 call_used_regs[i] = 1;
20157 }
20158
20159 /* Only allow the FFR and FFRT to be accessed via special patterns. */
20160 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20161 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20162
20163 /* When tracking speculation, we need a couple of call-clobbered registers
20164 to track the speculation state. It would be nice to just use
20165 IP0 and IP1, but currently there are numerous places that just
20166 assume these registers are free for other uses (eg pointer
20167 authentication). */
20168 if (aarch64_track_speculation)
20169 {
20170 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20171 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20172 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20173 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20174 }
20175 }
20176
20177 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
20178
20179 bool
20180 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20181 {
20182 /* For records we're passed a FIELD_DECL, for arrays we're passed
20183 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
20184 const_tree type = TREE_TYPE (field_or_array);
20185
20186 /* Assign BLKmode to anything that contains multiple SVE predicates.
20187 For structures, the "multiple" case is indicated by MODE being
20188 VOIDmode. */
20189 unsigned int num_zr, num_pr;
20190 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20191 {
20192 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20193 return !simple_cst_equal (TYPE_SIZE (field_or_array),
20194 TYPE_SIZE (type));
20195 return mode == VOIDmode;
20196 }
20197
20198 return default_member_type_forces_blk (field_or_array, mode);
20199 }
20200
20201 /* Bitmasks that indicate whether earlier versions of GCC would have
20202 taken a different path through the ABI logic. This should result in
20203 a -Wpsabi warning if the earlier path led to a different ABI decision.
20204
20205 WARN_PSABI_EMPTY_CXX17_BASE
20206 Indicates that the type includes an artificial empty C++17 base field
20207 that, prior to GCC 10.1, would prevent the type from being treated as
20208 a HFA or HVA. See PR94383 for details.
20209
20210 WARN_PSABI_NO_UNIQUE_ADDRESS
20211 Indicates that the type includes an empty [[no_unique_address]] field
20212 that, prior to GCC 10.1, would prevent the type from being treated as
20213 a HFA or HVA. */
20214 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20215 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
20216 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20217
20218 /* Walk down the type tree of TYPE counting consecutive base elements.
20219 If *MODEP is VOIDmode, then set it to the first valid floating point
20220 type. If a non-floating point type is found, or if a floating point
20221 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20222 otherwise return the count in the sub-tree.
20223
20224 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20225 function has changed its behavior relative to earlier versions of GCC.
20226 Normally the argument should be nonnull and point to a zero-initialized
20227 variable. The function then records whether the ABI decision might
20228 be affected by a known fix to the ABI logic, setting the associated
20229 WARN_PSABI_* bits if so.
20230
20231 When the argument is instead a null pointer, the function tries to
20232 simulate the behavior of GCC before all such ABI fixes were made.
20233 This is useful to check whether the function returns something
20234 different after the ABI fixes. */
20235 static int
20236 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
20237 unsigned int *warn_psabi_flags)
20238 {
20239 machine_mode mode;
20240 HOST_WIDE_INT size;
20241
20242 if (aarch64_sve::builtin_type_p (type))
20243 return -1;
20244
20245 switch (TREE_CODE (type))
20246 {
20247 case REAL_TYPE:
20248 mode = TYPE_MODE (type);
20249 if (mode != DFmode && mode != SFmode
20250 && mode != TFmode && mode != HFmode
20251 && mode != SDmode && mode != DDmode && mode != TDmode)
20252 return -1;
20253
20254 if (*modep == VOIDmode)
20255 *modep = mode;
20256
20257 if (*modep == mode)
20258 return 1;
20259
20260 break;
20261
20262 case COMPLEX_TYPE:
20263 mode = TYPE_MODE (TREE_TYPE (type));
20264 if (mode != DFmode && mode != SFmode
20265 && mode != TFmode && mode != HFmode)
20266 return -1;
20267
20268 if (*modep == VOIDmode)
20269 *modep = mode;
20270
20271 if (*modep == mode)
20272 return 2;
20273
20274 break;
20275
20276 case VECTOR_TYPE:
20277 /* Use V2SImode and V4SImode as representatives of all 64-bit
20278 and 128-bit vector types. */
20279 size = int_size_in_bytes (type);
20280 switch (size)
20281 {
20282 case 8:
20283 mode = V2SImode;
20284 break;
20285 case 16:
20286 mode = V4SImode;
20287 break;
20288 default:
20289 return -1;
20290 }
20291
20292 if (*modep == VOIDmode)
20293 *modep = mode;
20294
20295 /* Vector modes are considered to be opaque: two vectors are
20296 equivalent for the purposes of being homogeneous aggregates
20297 if they are the same size. */
20298 if (*modep == mode)
20299 return 1;
20300
20301 break;
20302
20303 case ARRAY_TYPE:
20304 {
20305 int count;
20306 tree index = TYPE_DOMAIN (type);
20307
20308 /* Can't handle incomplete types nor sizes that are not
20309 fixed. */
20310 if (!COMPLETE_TYPE_P (type)
20311 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20312 return -1;
20313
20314 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20315 warn_psabi_flags);
20316 if (count == -1
20317 || !index
20318 || !TYPE_MAX_VALUE (index)
20319 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20320 || !TYPE_MIN_VALUE (index)
20321 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20322 || count < 0)
20323 return -1;
20324
20325 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20326 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20327
20328 /* There must be no padding. */
20329 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20330 count * GET_MODE_BITSIZE (*modep)))
20331 return -1;
20332
20333 return count;
20334 }
20335
20336 case RECORD_TYPE:
20337 {
20338 int count = 0;
20339 int sub_count;
20340 tree field;
20341
20342 /* Can't handle incomplete types nor sizes that are not
20343 fixed. */
20344 if (!COMPLETE_TYPE_P (type)
20345 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20346 return -1;
20347
20348 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20349 {
20350 if (TREE_CODE (field) != FIELD_DECL)
20351 continue;
20352
20353 if (DECL_FIELD_ABI_IGNORED (field))
20354 {
20355 /* See whether this is something that earlier versions of
20356 GCC failed to ignore. */
20357 unsigned int flag;
20358 if (lookup_attribute ("no_unique_address",
20359 DECL_ATTRIBUTES (field)))
20360 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20361 else if (cxx17_empty_base_field_p (field))
20362 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20363 else
20364 /* No compatibility problem. */
20365 continue;
20366
20367 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20368 if (warn_psabi_flags)
20369 {
20370 *warn_psabi_flags |= flag;
20371 continue;
20372 }
20373 }
20374 /* A zero-width bitfield may affect layout in some
20375 circumstances, but adds no members. The determination
20376 of whether or not a type is an HFA is performed after
20377 layout is complete, so if the type still looks like an
20378 HFA afterwards, it is still classed as one. This is
20379 potentially an ABI break for the hard-float ABI. */
20380 else if (DECL_BIT_FIELD (field)
20381 && integer_zerop (DECL_SIZE (field)))
20382 {
20383 /* Prior to GCC 12 these fields were stripped early,
20384 hiding them from the back-end entirely and
20385 resulting in the correct behaviour for argument
20386 passing. Simulate that old behaviour without
20387 generating a warning. */
20388 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20389 continue;
20390 if (warn_psabi_flags)
20391 {
20392 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20393 continue;
20394 }
20395 }
20396
20397 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20398 warn_psabi_flags);
20399 if (sub_count < 0)
20400 return -1;
20401 count += sub_count;
20402 }
20403
20404 /* There must be no padding. */
20405 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20406 count * GET_MODE_BITSIZE (*modep)))
20407 return -1;
20408
20409 return count;
20410 }
20411
20412 case UNION_TYPE:
20413 case QUAL_UNION_TYPE:
20414 {
20415 /* These aren't very interesting except in a degenerate case. */
20416 int count = 0;
20417 int sub_count;
20418 tree field;
20419
20420 /* Can't handle incomplete types nor sizes that are not
20421 fixed. */
20422 if (!COMPLETE_TYPE_P (type)
20423 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20424 return -1;
20425
20426 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20427 {
20428 if (TREE_CODE (field) != FIELD_DECL)
20429 continue;
20430
20431 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20432 warn_psabi_flags);
20433 if (sub_count < 0)
20434 return -1;
20435 count = count > sub_count ? count : sub_count;
20436 }
20437
20438 /* There must be no padding. */
20439 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20440 count * GET_MODE_BITSIZE (*modep)))
20441 return -1;
20442
20443 return count;
20444 }
20445
20446 default:
20447 break;
20448 }
20449
20450 return -1;
20451 }
20452
20453 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20454 type as described in AAPCS64 \S 4.1.2.
20455
20456 See the comment above aarch64_composite_type_p for the notes on MODE. */
20457
20458 static bool
20459 aarch64_short_vector_p (const_tree type,
20460 machine_mode mode)
20461 {
20462 poly_int64 size = -1;
20463
20464 if (type && TREE_CODE (type) == VECTOR_TYPE)
20465 {
20466 if (aarch64_sve::builtin_type_p (type))
20467 return false;
20468 size = int_size_in_bytes (type);
20469 }
20470 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20471 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20472 {
20473 /* The containing "else if" is too loose: it means that we look at TYPE
20474 if the type is a vector type (good), but that we otherwise ignore TYPE
20475 and look only at the mode. This is wrong because the type describes
20476 the language-level information whereas the mode is purely an internal
20477 GCC concept. We can therefore reach here for types that are not
20478 vectors in the AAPCS64 sense.
20479
20480 We can't "fix" that for the traditional Advanced SIMD vector modes
20481 without breaking backwards compatibility. However, there's no such
20482 baggage for the structure modes, which were introduced in GCC 12. */
20483 if (aarch64_advsimd_struct_mode_p (mode))
20484 return false;
20485
20486 /* For similar reasons, rely only on the type, not the mode, when
20487 processing SVE types. */
20488 if (type && aarch64_some_values_include_pst_objects_p (type))
20489 /* Leave later code to report an error if SVE is disabled. */
20490 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20491 else
20492 size = GET_MODE_SIZE (mode);
20493 }
20494 if (known_eq (size, 8) || known_eq (size, 16))
20495 {
20496 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20497 they are being treated as scalable AAPCS64 types. */
20498 gcc_assert (!aarch64_sve_mode_p (mode)
20499 && !aarch64_advsimd_struct_mode_p (mode));
20500 return true;
20501 }
20502 return false;
20503 }
20504
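/* For illustration: an 8-byte or 16-byte Advanced SIMD type such as
   int32x2_t or int32x4_t satisfies this predicate, whereas an SVE ACLE
   type such as svint32_t does not, since SVE builtin types are rejected
   up front and never reach the size check.  */
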
20505 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20506 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20507 array types. The C99 floating-point complex types are also considered
20508 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20509 types, which are GCC extensions and out of the scope of AAPCS64, are
20510 treated as composite types here as well.
20511
20512 Note that MODE itself is not sufficient in determining whether a type
20513 is such a composite type or not. This is because
20514 stor-layout.cc:compute_record_mode may have already changed the MODE
20515 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20516 structure with only one field may have its MODE set to the mode of the
20517 field. Also an integer mode whose size matches the size of the
20518 RECORD_TYPE type may be used to substitute the original mode
20519 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20520 solely relied on. */
20521
20522 static bool
20523 aarch64_composite_type_p (const_tree type,
20524 machine_mode mode)
20525 {
20526 if (aarch64_short_vector_p (type, mode))
20527 return false;
20528
20529 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20530 return true;
20531
20532 if (mode == BLKmode
20533 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20534 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20535 return true;
20536
20537 return false;
20538 }
20539
20540 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20541 shall be passed or returned in simd/fp register(s) (providing these
20542 parameter passing registers are available).
20543
20544 Upon successful return, *COUNT returns the number of needed registers,
20545 *BASE_MODE returns the mode of the individual register and when IS_HA
20546 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20547 floating-point aggregate or a homogeneous short-vector aggregate.
20548
20549 SILENT_P is true if the function should refrain from reporting any
20550 diagnostics. This should only be used if the caller is certain that
20551 any ABI decisions would eventually come through this function with
20552 SILENT_P set to false. */
20553
20554 static bool
20555 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20556 const_tree type,
20557 machine_mode *base_mode,
20558 int *count,
20559 bool *is_ha,
20560 bool silent_p)
20561 {
20562 if (is_ha != NULL) *is_ha = false;
20563
20564 machine_mode new_mode = VOIDmode;
20565 bool composite_p = aarch64_composite_type_p (type, mode);
20566
20567 if ((!composite_p
20568 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20569 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20570 || aarch64_short_vector_p (type, mode))
20571 {
20572 *count = 1;
20573 new_mode = mode;
20574 }
20575 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20576 {
20577 if (is_ha != NULL) *is_ha = true;
20578 *count = 2;
20579 new_mode = GET_MODE_INNER (mode);
20580 }
20581 else if (type && composite_p)
20582 {
20583 unsigned int warn_psabi_flags = 0;
20584 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20585 &warn_psabi_flags);
20586 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20587 {
20588 static unsigned last_reported_type_uid;
20589 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20590 int alt;
20591 if (!silent_p
20592 && warn_psabi
20593 && warn_psabi_flags
20594 && uid != last_reported_type_uid
20595 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20596 != ag_count))
20597 {
20598 const char *url10
20599 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20600 const char *url12
20601 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20602 gcc_assert (alt == -1);
20603 last_reported_type_uid = uid;
20604 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20605 qualification. */
20606 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20607 inform (input_location, "parameter passing for argument of "
20608 "type %qT with %<[[no_unique_address]]%> members "
20609 "changed %{in GCC 10.1%}",
20610 TYPE_MAIN_VARIANT (type), url10);
20611 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20612 inform (input_location, "parameter passing for argument of "
20613 "type %qT when C++17 is enabled changed to match "
20614 "C++14 %{in GCC 10.1%}",
20615 TYPE_MAIN_VARIANT (type), url10);
20616 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20617 inform (input_location, "parameter passing for argument of "
20618 "type %qT changed %{in GCC 12.1%}",
20619 TYPE_MAIN_VARIANT (type), url12);
20620 }
20621
20622 if (is_ha != NULL) *is_ha = true;
20623 *count = ag_count;
20624 }
20625 else
20626 return false;
20627 }
20628 else
20629 return false;
20630
20631 gcc_assert (!aarch64_sve_mode_p (new_mode));
20632 *base_mode = new_mode;
20633 return true;
20634 }
20635
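/* Worked example: for an argument of type struct { double x; double y; },
   aapcs_vfp_sub_candidate finds two DFmode elements, so the function
   returns true with *BASE_MODE == DFmode, *COUNT == 2 and *IS_HA set,
   making the argument eligible for two FP/SIMD registers.  A _Complex
   float argument instead takes the MODE_COMPLEX_FLOAT path and likewise
   yields *COUNT == 2, with *BASE_MODE == SFmode.  */
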
20636 /* Implement TARGET_STRUCT_VALUE_RTX. */
20637
20638 static rtx
20639 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20640 int incoming ATTRIBUTE_UNUSED)
20641 {
20642 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20643 }
20644
20645 /* Implements target hook vector_mode_supported_p. */
20646 static bool
20647 aarch64_vector_mode_supported_p (machine_mode mode)
20648 {
20649 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20650 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20651 }
20652
20653 /* Return the full-width SVE vector mode for element mode MODE, if one
20654 exists. */
20655 opt_machine_mode
20656 aarch64_full_sve_mode (scalar_mode mode)
20657 {
20658 switch (mode)
20659 {
20660 case E_DFmode:
20661 return VNx2DFmode;
20662 case E_SFmode:
20663 return VNx4SFmode;
20664 case E_HFmode:
20665 return VNx8HFmode;
20666 case E_BFmode:
20667 return VNx8BFmode;
20668 case E_DImode:
20669 return VNx2DImode;
20670 case E_SImode:
20671 return VNx4SImode;
20672 case E_HImode:
20673 return VNx8HImode;
20674 case E_QImode:
20675 return VNx16QImode;
20676 default:
20677 return opt_machine_mode ();
20678 }
20679 }
20680
20681 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20682 if it exists. */
20683 opt_machine_mode
20684 aarch64_vq_mode (scalar_mode mode)
20685 {
20686 switch (mode)
20687 {
20688 case E_DFmode:
20689 return V2DFmode;
20690 case E_SFmode:
20691 return V4SFmode;
20692 case E_HFmode:
20693 return V8HFmode;
20694 case E_BFmode:
20695 return V8BFmode;
20696 case E_SImode:
20697 return V4SImode;
20698 case E_HImode:
20699 return V8HImode;
20700 case E_QImode:
20701 return V16QImode;
20702 case E_DImode:
20703 return V2DImode;
20704 default:
20705 return opt_machine_mode ();
20706 }
20707 }
20708
20709 /* Return appropriate SIMD container
20710 for MODE within a vector of WIDTH bits. */
20711 static machine_mode
20712 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20713 {
20714 if (TARGET_SVE
20715 && maybe_ne (width, 128)
20716 && known_eq (width, BITS_PER_SVE_VECTOR))
20717 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20718
20719 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20720 if (TARGET_SIMD)
20721 {
20722 if (known_eq (width, 128))
20723 return aarch64_vq_mode (mode).else_mode (word_mode);
20724 else
20725 switch (mode)
20726 {
20727 case E_SFmode:
20728 return V2SFmode;
20729 case E_HFmode:
20730 return V4HFmode;
20731 case E_BFmode:
20732 return V4BFmode;
20733 case E_SImode:
20734 return V2SImode;
20735 case E_HImode:
20736 return V4HImode;
20737 case E_QImode:
20738 return V8QImode;
20739 default:
20740 break;
20741 }
20742 }
20743 return word_mode;
20744 }
20745
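/* For example, SFmode with WIDTH 128 gives V4SFmode and with WIDTH 64
   gives V2SFmode; if SVE is enabled and WIDTH matches BITS_PER_SVE_VECTOR
   without being known to equal 128, SFmode maps to the scalable
   VNx4SFmode container instead.  */
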
20746 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20747 and return whether the SVE mode should be preferred over the
20748 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20749 static bool
20750 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20751 {
20752 /* Take into account the aarch64-autovec-preference param if non-zero. */
20753 bool only_asimd_p = aarch64_autovec_preference == 1;
20754 bool only_sve_p = aarch64_autovec_preference == 2;
20755
20756 if (only_asimd_p)
20757 return false;
20758 if (only_sve_p)
20759 return true;
20760
20761 /* The preference in case of a tie in costs. */
20762 bool prefer_asimd = aarch64_autovec_preference == 3;
20763 bool prefer_sve = aarch64_autovec_preference == 4;
20764
20765 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20766 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20767 /* If the CPU information does not have an SVE width registered, use the
20768 generic poly_int comparison that prefers SVE. If a preference is
20769 explicitly requested, avoid this path. */
20770 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20771 && !prefer_asimd
20772 && !prefer_sve)
20773 return maybe_gt (nunits_sve, nunits_asimd);
20774
20775 /* Otherwise estimate the runtime width of the modes involved. */
20776 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20777 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20778
20779 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20780 is clearly wider. */
20781 if (prefer_sve)
20782 return est_sve >= est_asimd;
20783 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20784 is clearly wider. */
20785 if (prefer_asimd)
20786 return est_sve > est_asimd;
20787
20788 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20789 return est_sve > est_asimd;
20790 }
20791
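/* For example, with a tuning that sets sve_width to 128 and the
   aarch64-autovec-preference param left at its default of 0, VNx4SImode
   and V4SImode both estimate to four elements, so est_sve > est_asimd
   is false and the tie goes to Advanced SIMD.  */
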
20792 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20793 static machine_mode
20794 aarch64_preferred_simd_mode (scalar_mode mode)
20795 {
20796 /* Take into account explicit auto-vectorization ISA preferences through
20797 aarch64_cmp_autovec_modes. */
20798 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20799 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20800 if (TARGET_SIMD)
20801 return aarch64_vq_mode (mode).else_mode (word_mode);
20802 return word_mode;
20803 }
20804
20805 /* Return a list of possible vector sizes for the vectorizer
20806 to iterate over. */
20807 static unsigned int
20808 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20809 {
20810 static const machine_mode sve_modes[] = {
20811 /* Try using full vectors for all element types. */
20812 VNx16QImode,
20813
20814 /* Try using 16-bit containers for 8-bit elements and full vectors
20815 for wider elements. */
20816 VNx8QImode,
20817
20818 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20819 full vectors for wider elements. */
20820 VNx4QImode,
20821
20822 /* Try using 64-bit containers for all element types. */
20823 VNx2QImode
20824 };
20825
20826 static const machine_mode advsimd_modes[] = {
20827 /* Try using 128-bit vectors for all element types. */
20828 V16QImode,
20829
20830 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20831 for wider elements. */
20832 V8QImode,
20833
20834 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20835 for wider elements.
20836
20837 TODO: We could support a limited form of V4QImode too, so that
20838 we use 32-bit vectors for 8-bit elements. */
20839 V4HImode,
20840
20841 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20842 for 64-bit elements.
20843
20844 TODO: We could similarly support limited forms of V2QImode and V2HImode
20845 for this case. */
20846 V2SImode
20847 };
20848
20849 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20850 This is because:
20851
20852 - If we can't use N-byte Advanced SIMD vectors then the placement
20853 doesn't matter; we'll just continue as though the Advanced SIMD
20854 entry didn't exist.
20855
20856 - If an SVE main loop with N bytes ends up being cheaper than an
20857 Advanced SIMD main loop with N bytes then by default we'll replace
20858 the Advanced SIMD version with the SVE one.
20859
20860 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20861 than an SVE main loop with N bytes then by default we'll try to
20862 use the SVE loop to vectorize the epilogue instead. */
20863
20864 bool only_asimd_p = aarch64_autovec_preference == 1;
20865 bool only_sve_p = aarch64_autovec_preference == 2;
20866
20867 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20868 unsigned int advsimd_i = 0;
20869
20870 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20871 {
20872 if (sve_i < ARRAY_SIZE (sve_modes)
20873 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20874 advsimd_modes[advsimd_i]))
20875 modes->safe_push (sve_modes[sve_i++]);
20876 else
20877 modes->safe_push (advsimd_modes[advsimd_i++]);
20878 }
20879 while (sve_i < ARRAY_SIZE (sve_modes))
20880 modes->safe_push (sve_modes[sve_i++]);
20881
20882 unsigned int flags = 0;
20883 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20884 can compare SVE against Advanced SIMD and so that we can compare
20885 multiple SVE vectorization approaches against each other. There's
20886 not really any point doing this for Advanced SIMD only, since the
20887 first mode that works should always be the best. */
20888 if (TARGET_SVE && aarch64_sve_compare_costs)
20889 flags |= VECT_COMPARE_COSTS;
20890 return flags;
20891 }
20892
20893 /* Implement TARGET_MANGLE_TYPE. */
20894
20895 static const char *
20896 aarch64_mangle_type (const_tree type)
20897 {
20898 /* The AArch64 ABI documents say that "__va_list" has to be
20899 mangled as if it is in the "std" namespace. */
20900 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20901 return "St9__va_list";
20902
20903 /* Half-precision floating point types. */
20904 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20905 {
20906 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
20907 return NULL;
20908 if (TYPE_MODE (type) == BFmode)
20909 return "u6__bf16";
20910 else
20911 return "Dh";
20912 }
20913
20914 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20915 builtin types. */
20916 if (TYPE_NAME (type) != NULL)
20917 {
20918 const char *res;
20919 if ((res = aarch64_general_mangle_builtin_type (type))
20920 || (res = aarch64_sve::mangle_builtin_type (type)))
20921 return res;
20922 }
20923
20924 /* Use the default mangling. */
20925 return NULL;
20926 }
20927
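/* For example, __fp16 mangles as "Dh", __bf16 as "u6__bf16" and
   __va_list as "St9__va_list", while _Float16 (float16_type_node) falls
   through to the default mangling.  */
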
20928 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20929
20930 static bool
20931 aarch64_verify_type_context (location_t loc, type_context_kind context,
20932 const_tree type, bool silent_p)
20933 {
20934 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20935 }
20936
20937 /* Find the first rtx_insn before insn that will generate an assembly
20938 instruction. */
20939
20940 static rtx_insn *
20941 aarch64_prev_real_insn (rtx_insn *insn)
20942 {
20943 if (!insn)
20944 return NULL;
20945
20946 do
20947 {
20948 insn = prev_real_insn (insn);
20949 }
20950 while (insn && recog_memoized (insn) < 0);
20951
20952 return insn;
20953 }
20954
20955 static bool
20956 is_madd_op (enum attr_type t1)
20957 {
20958 unsigned int i;
20959 /* A number of these may be AArch32 only. */
20960 enum attr_type mlatypes[] = {
20961 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20962 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20963 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20964 };
20965
20966 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20967 {
20968 if (t1 == mlatypes[i])
20969 return true;
20970 }
20971
20972 return false;
20973 }
20974
20975 /* Check if there is a register dependency between a load and the insn
20976 for which we hold recog_data. */
20977
20978 static bool
20979 dep_between_memop_and_curr (rtx memop)
20980 {
20981 rtx load_reg;
20982 int opno;
20983
20984 gcc_assert (GET_CODE (memop) == SET);
20985
20986 if (!REG_P (SET_DEST (memop)))
20987 return false;
20988
20989 load_reg = SET_DEST (memop);
20990 for (opno = 1; opno < recog_data.n_operands; opno++)
20991 {
20992 rtx operand = recog_data.operand[opno];
20993 if (REG_P (operand)
20994 && reg_overlap_mentioned_p (load_reg, operand))
20995 return true;
20996
20997 }
20998 return false;
20999 }
21000
21001
21002 /* When working around the Cortex-A53 erratum 835769,
21003 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
21004 instruction and has a preceding memory instruction such that a NOP
21005 should be inserted between them. */
21006
21007 bool
21008 aarch64_madd_needs_nop (rtx_insn* insn)
21009 {
21010 enum attr_type attr_type;
21011 rtx_insn *prev;
21012 rtx body;
21013
21014 if (!TARGET_FIX_ERR_A53_835769)
21015 return false;
21016
21017 if (!INSN_P (insn) || recog_memoized (insn) < 0)
21018 return false;
21019
21020 attr_type = get_attr_type (insn);
21021 if (!is_madd_op (attr_type))
21022 return false;
21023
21024 prev = aarch64_prev_real_insn (insn);
21025 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
21026 Restore recog state to INSN to avoid state corruption. */
21027 extract_constrain_insn_cached (insn);
21028
21029 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
21030 return false;
21031
21032 body = single_set (prev);
21033
21034 /* If the previous insn is a memory op and there is no dependency between
21035 it and the DImode madd, emit a NOP between them. If body is NULL then we
21036 have a complex memory operation, probably a load/store pair.
21037 Be conservative for now and emit a NOP. */
21038 if (GET_MODE (recog_data.operand[0]) == DImode
21039 && (!body || !dep_between_memop_and_curr (body)))
21040 return true;
21041
21042 return false;
21043
21044 }
21045
21046
21047 /* Implement FINAL_PRESCAN_INSN. */
21048
21049 void
21050 aarch64_final_prescan_insn (rtx_insn *insn)
21051 {
21052 if (aarch64_madd_needs_nop (insn))
21053 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
21054 }
21055
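/* Illustrative sketch of the workaround: with -mfix-cortex-a53-835769,
   a sequence along the lines of

	ldr	x1, [x2]
	madd	x0, x3, x4, x5

   is emitted as

	ldr	x1, [x2]
	nop	// between mem op and mult-accumulate
	madd	x0, x3, x4, x5

   because the multiply-accumulate is 64-bit, directly follows a memory
   operation and has no register dependency on the loaded value.  */
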
21056
21057 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
21058 instruction. */
21059
21060 bool
21061 aarch64_sve_index_immediate_p (rtx base_or_step)
21062 {
21063 return (CONST_INT_P (base_or_step)
21064 && IN_RANGE (INTVAL (base_or_step), -16, 15));
21065 }
21066
21067 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
21068 when applied to mode MODE. Negate X first if NEGATE_P is true. */
21069
21070 bool
21071 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
21072 {
21073 rtx elt = unwrap_const_vec_duplicate (x);
21074 if (!CONST_INT_P (elt))
21075 return false;
21076
21077 HOST_WIDE_INT val = INTVAL (elt);
21078 if (negate_p)
21079 val = -val;
21080 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
21081
21082 if (val & 0xff)
21083 return IN_RANGE (val, 0, 0xff);
21084 return IN_RANGE (val, 0, 0xff00);
21085 }
21086
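/* For example, #3 (low byte only) and #0x300 (i.e. #3, LSL #8) are
   accepted, but #0x101 is rejected because it needs both a non-zero low
   byte and bits above the low byte.  */
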
21087 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
21088 instructions when applied to mode MODE. Negate X first if NEGATE_P
21089 is true. */
21090
21091 bool
21092 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
21093 {
21094 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
21095 return false;
21096
21097 /* After the optional negation, the immediate must be nonnegative.
21098 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
21099 instead of SQADD Zn.B, Zn.B, #129. */
21100 rtx elt = unwrap_const_vec_duplicate (x);
21101 return negate_p == (INTVAL (elt) < 0);
21102 }
21103
21104 /* Return true if X is a valid immediate operand for an SVE logical
21105 instruction such as AND. */
21106
21107 bool
21108 aarch64_sve_bitmask_immediate_p (rtx x)
21109 {
21110 rtx elt;
21111
21112 return (const_vec_duplicate_p (x, &elt)
21113 && CONST_INT_P (elt)
21114 && aarch64_bitmask_imm (INTVAL (elt),
21115 GET_MODE_INNER (GET_MODE (x))));
21116 }
21117
21118 /* Return true if X is a valid immediate for the SVE DUP and CPY
21119 instructions. */
21120
21121 bool
21122 aarch64_sve_dup_immediate_p (rtx x)
21123 {
21124 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
21125 if (!CONST_INT_P (x))
21126 return false;
21127
21128 HOST_WIDE_INT val = INTVAL (x);
21129 if (val & 0xff)
21130 return IN_RANGE (val, -0x80, 0x7f);
21131 return IN_RANGE (val, -0x8000, 0x7f00);
21132 }
21133
21134 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21135 SIGNED_P says whether the operand is signed rather than unsigned. */
21136
21137 bool
21138 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21139 {
21140 x = unwrap_const_vec_duplicate (x);
21141 return (CONST_INT_P (x)
21142 && (signed_p
21143 ? IN_RANGE (INTVAL (x), -16, 15)
21144 : IN_RANGE (INTVAL (x), 0, 127)));
21145 }
21146
21147 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21148 instruction. Negate X first if NEGATE_P is true. */
21149
21150 bool
21151 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21152 {
21153 rtx elt;
21154 REAL_VALUE_TYPE r;
21155
21156 if (!const_vec_duplicate_p (x, &elt)
21157 || !CONST_DOUBLE_P (elt))
21158 return false;
21159
21160 r = *CONST_DOUBLE_REAL_VALUE (elt);
21161
21162 if (negate_p)
21163 r = real_value_negate (&r);
21164
21165 if (real_equal (&r, &dconst1))
21166 return true;
21167 if (real_equal (&r, &dconsthalf))
21168 return true;
21169 return false;
21170 }
21171
21172 /* Return true if X is a valid immediate operand for an SVE FMUL
21173 instruction. */
21174
21175 bool
21176 aarch64_sve_float_mul_immediate_p (rtx x)
21177 {
21178 rtx elt;
21179
21180 return (const_vec_duplicate_p (x, &elt)
21181 && CONST_DOUBLE_P (elt)
21182 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21183 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
21184 }
21185
21186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21187 for the Advanced SIMD operation described by WHICH and INSN. If INFO
21188 is nonnull, use it to describe valid immediates. */
21189 static bool
21190 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21191 simd_immediate_info *info,
21192 enum simd_immediate_check which,
21193 simd_immediate_info::insn_type insn)
21194 {
21195 /* Try a 4-byte immediate with LSL. */
21196 for (unsigned int shift = 0; shift < 32; shift += 8)
21197 if ((val32 & (0xff << shift)) == val32)
21198 {
21199 if (info)
21200 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21201 simd_immediate_info::LSL, shift);
21202 return true;
21203 }
21204
21205 /* Try a 2-byte immediate with LSL. */
21206 unsigned int imm16 = val32 & 0xffff;
21207 if (imm16 == (val32 >> 16))
21208 for (unsigned int shift = 0; shift < 16; shift += 8)
21209 if ((imm16 & (0xff << shift)) == imm16)
21210 {
21211 if (info)
21212 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21213 simd_immediate_info::LSL, shift);
21214 return true;
21215 }
21216
21217 /* Try a 4-byte immediate with MSL, except for cases that MVN
21218 can handle. */
21219 if (which == AARCH64_CHECK_MOV)
21220 for (unsigned int shift = 8; shift < 24; shift += 8)
21221 {
21222 unsigned int low = (1 << shift) - 1;
21223 if (((val32 & (0xff << shift)) | low) == val32)
21224 {
21225 if (info)
21226 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21227 simd_immediate_info::MSL, shift);
21228 return true;
21229 }
21230 }
21231
21232 return false;
21233 }
21234
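/* Worked examples: VAL32 == 0x00ab0000 matches the 4-byte LSL case with
   shift 16 (#0xab, LSL #16), whereas VAL32 == 0x00abffff matches only
   the MSL case, tried for AARCH64_CHECK_MOV, with shift 16
   (#0xab, MSL #16).  */
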
21235 /* Return true if replicating VAL64 is a valid immediate for the
21236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21237 use it to describe valid immediates. */
21238 static bool
21239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21240 simd_immediate_info *info,
21241 enum simd_immediate_check which)
21242 {
21243 unsigned int val32 = val64 & 0xffffffff;
21244 unsigned int val16 = val64 & 0xffff;
21245 unsigned int val8 = val64 & 0xff;
21246
21247 if (val32 == (val64 >> 32))
21248 {
21249 if ((which & AARCH64_CHECK_ORR) != 0
21250 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21251 simd_immediate_info::MOV))
21252 return true;
21253
21254 if ((which & AARCH64_CHECK_BIC) != 0
21255 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21256 simd_immediate_info::MVN))
21257 return true;
21258
21259 /* Try using a replicated byte. */
21260 if (which == AARCH64_CHECK_MOV
21261 && val16 == (val32 >> 16)
21262 && val8 == (val16 >> 8))
21263 {
21264 if (info)
21265 *info = simd_immediate_info (QImode, val8);
21266 return true;
21267 }
21268 }
21269
21270 /* Try using a bit-to-bytemask. */
21271 if (which == AARCH64_CHECK_MOV)
21272 {
21273 unsigned int i;
21274 for (i = 0; i < 64; i += 8)
21275 {
21276 unsigned char byte = (val64 >> i) & 0xff;
21277 if (byte != 0 && byte != 0xff)
21278 break;
21279 }
21280 if (i == 64)
21281 {
21282 if (info)
21283 *info = simd_immediate_info (DImode, val64);
21284 return true;
21285 }
21286 }
21287 return false;
21288 }
21289
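/* For example, VAL64 == 0x00000000ffffffff has every byte equal to 0x00
   or 0xff, so for AARCH64_CHECK_MOV it is accepted via the
   bit-to-bytemask (64-bit MOVI) case even though its two 32-bit halves
   differ.  */
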
21290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21291 instruction. If INFO is nonnull, use it to describe valid immediates. */
21292
21293 static bool
21294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21295 simd_immediate_info *info)
21296 {
21297 scalar_int_mode mode = DImode;
21298 unsigned int val32 = val64 & 0xffffffff;
21299 if (val32 == (val64 >> 32))
21300 {
21301 mode = SImode;
21302 unsigned int val16 = val32 & 0xffff;
21303 if (val16 == (val32 >> 16))
21304 {
21305 mode = HImode;
21306 unsigned int val8 = val16 & 0xff;
21307 if (val8 == (val16 >> 8))
21308 mode = QImode;
21309 }
21310 }
21311 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21312 if (IN_RANGE (val, -0x80, 0x7f))
21313 {
21314 /* DUP with no shift. */
21315 if (info)
21316 *info = simd_immediate_info (mode, val);
21317 return true;
21318 }
21319 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21320 {
21321 /* DUP with LSL #8. */
21322 if (info)
21323 *info = simd_immediate_info (mode, val);
21324 return true;
21325 }
21326 if (aarch64_bitmask_imm (val64, mode))
21327 {
21328 /* DUPM. */
21329 if (info)
21330 *info = simd_immediate_info (mode, val);
21331 return true;
21332 }
21333 return false;
21334 }
21335
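/* Worked examples: VAL64 == 0x0101010101010101 collapses to QImode
   value 1 and is handled by DUP with no shift, while VAL64 ==
   0x0200020002000200 collapses to HImode value 0x200 and is handled by
   DUP with LSL #8 (#2, LSL #8).  */
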
21336 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21337
21338 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21339
21340 where PATTERN is the svpattern as a CONST_INT and where ZERO
21341 is a zero constant of the required PTRUE mode (which can have
21342 fewer elements than X's mode, if zero bits are significant).
21343
21344 If so, and if INFO is nonnull, describe the immediate in INFO. */
21345 bool
21346 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21347 {
21348 if (GET_CODE (x) != CONST)
21349 return false;
21350
21351 x = XEXP (x, 0);
21352 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21353 return false;
21354
21355 if (info)
21356 {
21357 aarch64_svpattern pattern
21358 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21359 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21360 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21361 *info = simd_immediate_info (int_mode, pattern);
21362 }
21363 return true;
21364 }
21365
21366 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21367 it to describe valid immediates. */
21368
21369 static bool
21370 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21371 {
21372 if (aarch64_sve_ptrue_svpattern_p (x, info))
21373 return true;
21374
21375 if (x == CONST0_RTX (GET_MODE (x)))
21376 {
21377 if (info)
21378 *info = simd_immediate_info (DImode, 0);
21379 return true;
21380 }
21381
21382 /* Analyze the value as a VNx16BImode. This should be relatively
21383 efficient, since rtx_vector_builder has enough built-in capacity
21384 to store all VLA predicate constants without needing the heap. */
21385 rtx_vector_builder builder;
21386 if (!aarch64_get_sve_pred_bits (builder, x))
21387 return false;
21388
21389 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21390 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21391 {
21392 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21393 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21394 if (pattern != AARCH64_NUM_SVPATTERNS)
21395 {
21396 if (info)
21397 {
21398 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21399 *info = simd_immediate_info (int_mode, pattern);
21400 }
21401 return true;
21402 }
21403 }
21404 return false;
21405 }
21406
21407 /* Return true if OP is a valid SIMD immediate for the operation
21408 described by WHICH. If INFO is nonnull, use it to describe valid
21409 immediates. */
21410 bool
21411 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21412 enum simd_immediate_check which)
21413 {
21414 machine_mode mode = GET_MODE (op);
21415 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21416 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21417 return false;
21418
21419 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
21420 return false;
21421
21422 if (vec_flags & VEC_SVE_PRED)
21423 return aarch64_sve_pred_valid_immediate (op, info);
21424
21425 scalar_mode elt_mode = GET_MODE_INNER (mode);
21426 rtx base, step;
21427 unsigned int n_elts;
21428 if (CONST_VECTOR_P (op)
21429 && CONST_VECTOR_DUPLICATE_P (op))
21430 n_elts = CONST_VECTOR_NPATTERNS (op);
21431 else if ((vec_flags & VEC_SVE_DATA)
21432 && const_vec_series_p (op, &base, &step))
21433 {
21434 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21435 if (!aarch64_sve_index_immediate_p (base)
21436 || !aarch64_sve_index_immediate_p (step))
21437 return false;
21438
21439 if (info)
21440 {
21441 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21442 should yield two integer values per 128-bit block, meaning
21443 that we need to treat it in the same way as V2DI and then
21444 ignore the upper 32 bits of each element. */
21445 elt_mode = aarch64_sve_container_int_mode (mode);
21446 *info = simd_immediate_info (elt_mode, base, step);
21447 }
21448 return true;
21449 }
21450 else if (CONST_VECTOR_P (op)
21451 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21452 /* N_ELTS set above. */;
21453 else
21454 return false;
21455
21456 scalar_float_mode elt_float_mode;
21457 if (n_elts == 1
21458 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21459 {
21460 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21461 if (aarch64_float_const_zero_rtx_p (elt)
21462 || aarch64_float_const_representable_p (elt))
21463 {
21464 if (info)
21465 *info = simd_immediate_info (elt_float_mode, elt);
21466 return true;
21467 }
21468 }
21469
21470 /* If all elements in an SVE vector have the same value, we have a free
21471 choice between using the element mode and using the container mode.
21472 Using the element mode means that unused parts of the vector are
21473 duplicates of the used elements, while using the container mode means
21474 that the unused parts are an extension of the used elements. Using the
21475 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21476 for its container mode VNx4SI while 0x00000101 isn't.
21477
21478 If not all elements in an SVE vector have the same value, we need the
21479 transition from one element to the next to occur at container boundaries.
21480 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21481 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21482 scalar_int_mode elt_int_mode;
21483 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21484 elt_int_mode = aarch64_sve_container_int_mode (mode);
21485 else
21486 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21487
21488 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21489 if (elt_size > 8)
21490 return false;
21491
21492 /* Expand the vector constant out into a byte vector, with the least
21493 significant byte of the register first. */
21494 auto_vec<unsigned char, 16> bytes;
21495 bytes.reserve (n_elts * elt_size);
21496 for (unsigned int i = 0; i < n_elts; i++)
21497 {
21498 /* The vector is provided in gcc endian-neutral fashion.
21499 For aarch64_be Advanced SIMD, it must be laid out in the vector
21500 register in reverse order. */
21501 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21502 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21503
21504 if (elt_mode != elt_int_mode)
21505 elt = gen_lowpart (elt_int_mode, elt);
21506
21507 if (!CONST_INT_P (elt))
21508 return false;
21509
21510 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21511 for (unsigned int byte = 0; byte < elt_size; byte++)
21512 {
21513 bytes.quick_push (elt_val & 0xff);
21514 elt_val >>= BITS_PER_UNIT;
21515 }
21516 }
21517
21518 /* The immediate must repeat every eight bytes. */
21519 unsigned int nbytes = bytes.length ();
21520 for (unsigned i = 8; i < nbytes; ++i)
21521 if (bytes[i] != bytes[i - 8])
21522 return false;
21523
21524 /* Get the repeating 8-byte value as an integer. No endian correction
21525 is needed here because bytes is already in lsb-first order. */
21526 unsigned HOST_WIDE_INT val64 = 0;
21527 for (unsigned int i = 0; i < 8; i++)
21528 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21529 << (i * BITS_PER_UNIT));
21530
21531 if (vec_flags & VEC_SVE_DATA)
21532 return aarch64_sve_valid_immediate (val64, info);
21533 else
21534 return aarch64_advsimd_valid_immediate (val64, info, which);
21535 }
21536
21537 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21538 has a step in the range of INDEX. Return the index expression if so,
21539 otherwise return null. */
21540 rtx
21541 aarch64_check_zero_based_sve_index_immediate (rtx x)
21542 {
21543 rtx base, step;
21544 if (const_vec_series_p (x, &base, &step)
21545 && base == const0_rtx
21546 && aarch64_sve_index_immediate_p (step))
21547 return step;
21548 return NULL_RTX;
21549 }
21550
21551 /* Check that immediate shift constants are within range. */
21552 bool
21553 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21554 {
21555 x = unwrap_const_vec_duplicate (x);
21556 if (!CONST_INT_P (x))
21557 return false;
21558 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21559 if (left)
21560 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21561 else
21562 return IN_RANGE (INTVAL (x), 1, bit_width);
21563 }
21564
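/* For example, with 32-bit elements, left-shift immediates 0..31 and
   right-shift immediates 1..32 are accepted.  */
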
21565 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21566 operation of width WIDTH at bit position POS. */
21567
21568 rtx
21569 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21570 {
21571 gcc_assert (CONST_INT_P (width));
21572 gcc_assert (CONST_INT_P (pos));
21573
21574 unsigned HOST_WIDE_INT mask
21575 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21576 return GEN_INT (mask << UINTVAL (pos));
21577 }
21578
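/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */
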
21579 bool
21580 aarch64_mov_operand_p (rtx x, machine_mode mode)
21581 {
21582 if (GET_CODE (x) == HIGH
21583 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21584 return true;
21585
21586 if (CONST_INT_P (x))
21587 return true;
21588
21589 if (VECTOR_MODE_P (GET_MODE (x)))
21590 {
21591 /* Require predicate constants to be VNx16BI before RA, so that we
21592 force everything to have a canonical form. */
21593 if (!lra_in_progress
21594 && !reload_completed
21595 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21596 && GET_MODE (x) != VNx16BImode)
21597 return false;
21598
21599 return aarch64_simd_valid_immediate (x, NULL);
21600 }
21601
21602 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21603 x = strip_salt (x);
21604
21605 /* GOT accesses are valid moves. */
21606 if (SYMBOL_REF_P (x)
21607 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21608 return true;
21609
21610 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21611 return true;
21612
21613 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21614 return true;
21615
21616 return aarch64_classify_symbolic_expression (x)
21617 == SYMBOL_TINY_ABSOLUTE;
21618 }
21619
21620 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21621 the constant creation. */
21622
21623 rtx
21624 aarch64_gen_shareable_zero (machine_mode mode)
21625 {
21626 machine_mode zmode = V4SImode;
21627 rtx tmp = gen_reg_rtx (zmode);
21628 emit_move_insn (tmp, CONST0_RTX (zmode));
21629 return lowpart_subreg (mode, tmp, zmode);
21630 }
21631
21632 /* Return a const_int vector of VAL. */
21633 rtx
21634 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21635 {
21636 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21637 return gen_const_vec_duplicate (mode, c);
21638 }
21639
21640 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21641
21642 bool
21643 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21644 {
21645 machine_mode vmode;
21646
21647 vmode = aarch64_simd_container_mode (mode, 64);
21648 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21649 return aarch64_simd_valid_immediate (op_v, NULL);
21650 }
21651
21652 /* Construct and return a PARALLEL RTX vector with elements numbering the
21653 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21654 the vector - from the perspective of the architecture. This does not
21655 line up with GCC's perspective on lane numbers, so we end up with
21656 different masks depending on our target endian-ness. The diagram
21657 below may help. We must draw the distinction when building masks
21658 which select one half of the vector. An instruction selecting
21659 architectural low-lanes for a big-endian target must be described using
21660 a mask selecting GCC high-lanes.
21661
21662 Big-Endian Little-Endian
21663
21664 GCC 0 1 2 3 3 2 1 0
21665 | x | x | x | x | | x | x | x | x |
21666 Architecture 3 2 1 0 3 2 1 0
21667
21668 Low Mask: { 2, 3 } { 0, 1 }
21669 High Mask: { 0, 1 } { 2, 3 }
21670
21671 MODE Is the mode of the vector and NUNITS is the number of units in it. */
21672
21673 rtx
21674 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21675 {
21676 rtvec v = rtvec_alloc (nunits / 2);
21677 int high_base = nunits / 2;
21678 int low_base = 0;
21679 int base;
21680 rtx t1;
21681 int i;
21682
21683 if (BYTES_BIG_ENDIAN)
21684 base = high ? low_base : high_base;
21685 else
21686 base = high ? high_base : low_base;
21687
21688 for (i = 0; i < nunits / 2; i++)
21689 RTVEC_ELT (v, i) = GEN_INT (base + i);
21690
21691 t1 = gen_rtx_PARALLEL (mode, v);
21692 return t1;
21693 }
21694
21695 /* Check OP for validity as a PARALLEL RTX vector with elements
21696 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21697 from the perspective of the architecture. See the diagram above
21698 aarch64_simd_vect_par_cnst_half for more details. */
21699
21700 bool
21701 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21702 bool high)
21703 {
21704 int nelts;
21705 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21706 return false;
21707
21708 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21709 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21710 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21711 int i = 0;
21712
21713 if (count_op != count_ideal)
21714 return false;
21715
21716 for (i = 0; i < count_ideal; i++)
21717 {
21718 rtx elt_op = XVECEXP (op, 0, i);
21719 rtx elt_ideal = XVECEXP (ideal, 0, i);
21720
21721 if (!CONST_INT_P (elt_op)
21722 || INTVAL (elt_ideal) != INTVAL (elt_op))
21723 return false;
21724 }
21725 return true;
21726 }
21727
21728 /* Return a PARALLEL containing NELTS elements, with element I equal
21729 to BASE + I * STEP. */
21730
21731 rtx
21732 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21733 {
21734 rtvec vec = rtvec_alloc (nelts);
21735 for (unsigned int i = 0; i < nelts; ++i)
21736 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21737 return gen_rtx_PARALLEL (VOIDmode, vec);
21738 }
21739
21740 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21741 series with step STEP. */
21742
21743 bool
21744 aarch64_stepped_int_parallel_p (rtx op, int step)
21745 {
21746 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21747 return false;
21748
21749 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21750 for (int i = 1; i < XVECLEN (op, 0); ++i)
21751 if (!CONST_INT_P (XVECEXP (op, 0, i))
21752 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21753 return false;
21754
21755 return true;
21756 }
21757
21758 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21759 HIGH (exclusive). */
21760 void
21761 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21762 const_tree exp)
21763 {
21764 HOST_WIDE_INT lane;
21765 gcc_assert (CONST_INT_P (operand));
21766 lane = INTVAL (operand);
21767
21768 if (lane < low || lane >= high)
21769 {
21770 if (exp)
21771 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21772 lane, low, high - 1);
21773 else
21774 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21775 }
21776 }
21777
21778 /* Perform endian correction on lane number N, which indexes a vector
21779 of mode MODE, and return the result as an SImode rtx. */
21780
21781 rtx
21782 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21783 {
21784 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
21785 }
21786
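/* For example, given the usual ENDIAN_LANE_N definition
   (NUNITS - 1 - N for big-endian, N otherwise), lane 0 of a V4SImode
   vector maps to 0 for little-endian and to 3 for big-endian.  */
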
21787 /* Return TRUE if OP is a valid vector addressing mode. */
21788
21789 bool
21790 aarch64_simd_mem_operand_p (rtx op)
21791 {
21792 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21793 || REG_P (XEXP (op, 0)));
21794 }
21795
21796 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21797
21798 bool
21799 aarch64_sve_ld1r_operand_p (rtx op)
21800 {
21801 struct aarch64_address_info addr;
21802 scalar_mode mode;
21803
21804 return (MEM_P (op)
21805 && is_a <scalar_mode> (GET_MODE (op), &mode)
21806 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21807 && addr.type == ADDRESS_REG_IMM
21808 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21809 }
21810
21811 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21812 where the size of the read data is specified by `mode` and the size of the
21813 vector elements is specified by `elem_mode`. */
21814 bool
21815 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21816 scalar_mode elem_mode)
21817 {
21818 struct aarch64_address_info addr;
21819 if (!MEM_P (op)
21820 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21821 return false;
21822
21823 if (addr.type == ADDRESS_REG_IMM)
21824 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21825
21826 if (addr.type == ADDRESS_REG_REG)
21827 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21828
21829 return false;
21830 }
21831
21832 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21833 bool
21834 aarch64_sve_ld1rq_operand_p (rtx op)
21835 {
21836 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21837 GET_MODE_INNER (GET_MODE (op)));
21838 }
21839
21840 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21841 accessing a vector where the element size is specified by `elem_mode`. */
21842 bool
21843 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21844 {
21845 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21846 }
21847
21848 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21849 bool
21850 aarch64_sve_ldff1_operand_p (rtx op)
21851 {
21852 if (!MEM_P (op))
21853 return false;
21854
21855 struct aarch64_address_info addr;
21856 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21857 return false;
21858
21859 if (addr.type == ADDRESS_REG_IMM)
21860 return known_eq (addr.const_offset, 0);
21861
21862 return addr.type == ADDRESS_REG_REG;
21863 }
21864
21865 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21866 bool
21867 aarch64_sve_ldnf1_operand_p (rtx op)
21868 {
21869 struct aarch64_address_info addr;
21870
21871 return (MEM_P (op)
21872 && aarch64_classify_address (&addr, XEXP (op, 0),
21873 GET_MODE (op), false)
21874 && addr.type == ADDRESS_REG_IMM);
21875 }
21876
21877 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21878 The conditions for STR are the same. */
21879 bool
21880 aarch64_sve_ldr_operand_p (rtx op)
21881 {
21882 struct aarch64_address_info addr;
21883
21884 return (MEM_P (op)
21885 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21886 false, ADDR_QUERY_ANY)
21887 && addr.type == ADDRESS_REG_IMM);
21888 }
21889
21890 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21891 addressing memory of mode MODE. */
21892 bool
21893 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21894 {
21895 struct aarch64_address_info addr;
21896 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21897 return false;
21898
21899 if (addr.type == ADDRESS_REG_IMM)
21900 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21901
21902 return addr.type == ADDRESS_REG_REG;
21903 }
21904
21905 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21906 We need to be able to access the individual pieces, so the range
21907 is different from LD[234] and ST[234]. */
21908 bool
21909 aarch64_sve_struct_memory_operand_p (rtx op)
21910 {
21911 if (!MEM_P (op))
21912 return false;
21913
21914 machine_mode mode = GET_MODE (op);
21915 struct aarch64_address_info addr;
21916 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21917 ADDR_QUERY_ANY)
21918 || addr.type != ADDRESS_REG_IMM)
21919 return false;
21920
21921 poly_int64 first = addr.const_offset;
21922 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21923 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21924 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21925 }
21926
21927 /* Emit a register copy from operand to operand, taking care not to
21928 early-clobber source registers in the process.
21929
21930 COUNT is the number of components into which the copy needs to be
21931 decomposed. */
21932 void
21933 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21934 unsigned int count)
21935 {
21936 unsigned int i;
21937 int rdest = REGNO (operands[0]);
21938 int rsrc = REGNO (operands[1]);
21939
21940 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21941 || rdest < rsrc)
21942 for (i = 0; i < count; i++)
21943 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21944 gen_rtx_REG (mode, rsrc + i));
21945 else
21946 for (i = 0; i < count; i++)
21947 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21948 gen_rtx_REG (mode, rsrc + count - i - 1));
21949 }
21950
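/* For example, copying an OImode value (COUNT == 2) from {v1, v2} to
   {v2, v3} overlaps with RDEST > RSRC, so the moves are emitted in
   reverse order (v3 <- v2, then v2 <- v1) to avoid clobbering v2 before
   it has been read.  */
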
21951 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21952 one of VSTRUCT modes: OI, CI, or XI. */
21953 int
21954 aarch64_simd_attr_length_rglist (machine_mode mode)
21955 {
21956 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21957 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21958 }
21959
21960 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21961 alignment of a vector to 128 bits. SVE predicates have an alignment of
21962 16 bits. */
21963 static HOST_WIDE_INT
21964 aarch64_simd_vector_alignment (const_tree type)
21965 {
21966 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21967 be set for non-predicate vectors of booleans. Modes are the most
21968 direct way we have of identifying real SVE predicate types. */
21969 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21970 return 16;
21971 widest_int min_size
21972 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21973 return wi::umin (min_size, 128).to_uhwi ();
21974 }
21975
21976 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21977 static poly_uint64
21978 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21979 {
21980 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21981 {
21982 /* If the length of the vector is a fixed power of 2, try to align
21983 to that length, otherwise don't try to align at all. */
21984 HOST_WIDE_INT result;
21985 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21986 || !pow2p_hwi (result))
21987 result = TYPE_ALIGN (TREE_TYPE (type));
21988 return result;
21989 }
21990 return TYPE_ALIGN (type);
21991 }
21992
21993 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21994 static bool
21995 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21996 {
21997 if (is_packed)
21998 return false;
21999
22000 /* For fixed-length vectors, check that the vectorizer will aim for
22001 full-vector alignment. This isn't true for generic GCC vectors
22002 that are wider than the ABI maximum of 128 bits. */
22003 poly_uint64 preferred_alignment =
22004 aarch64_vectorize_preferred_vector_alignment (type);
22005 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
22006 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
22007 preferred_alignment))
22008 return false;
22009
22010 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
22011 return true;
22012 }
22013
22014 /* Return true if the vector misalignment factor is supported by the
22015 target. */
22016 static bool
22017 aarch64_builtin_support_vector_misalignment (machine_mode mode,
22018 const_tree type, int misalignment,
22019 bool is_packed)
22020 {
22021 if (TARGET_SIMD && STRICT_ALIGNMENT)
22022 {
22023 /* Return false if the movmisalign pattern is not supported for this mode. */
22024 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
22025 return false;
22026
22027 /* Misalignment factor is unknown at compile time. */
22028 if (misalignment == -1)
22029 return false;
22030 }
22031 return default_builtin_support_vector_misalignment (mode, type, misalignment,
22032 is_packed);
22033 }
22034
22035 /* If VALS is a vector constant that can be loaded into a register
22036 using DUP, generate instructions to do so and return an RTX to
22037 assign to the register. Otherwise return NULL_RTX. */
22038 static rtx
22039 aarch64_simd_dup_constant (rtx vals)
22040 {
22041 machine_mode mode = GET_MODE (vals);
22042 machine_mode inner_mode = GET_MODE_INNER (mode);
22043 rtx x;
22044
22045 if (!const_vec_duplicate_p (vals, &x))
22046 return NULL_RTX;
22047
22048 /* We can load this constant by using DUP and a constant in a
22049 single ARM register. This will be cheaper than a vector
22050 load. */
22051 x = copy_to_mode_reg (inner_mode, x);
22052 return gen_vec_duplicate (mode, x);
22053 }
22054
22055
22056 /* Generate code to load VALS, which is a PARALLEL containing only
22057 constants (for vec_init) or CONST_VECTOR, efficiently into a
22058 register. Returns an RTX to copy into the register, or NULL_RTX
22059 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
22060 static rtx
22061 aarch64_simd_make_constant (rtx vals)
22062 {
22063 machine_mode mode = GET_MODE (vals);
22064 rtx const_dup;
22065 rtx const_vec = NULL_RTX;
22066 int n_const = 0;
22067 int i;
22068
22069 if (CONST_VECTOR_P (vals))
22070 const_vec = vals;
22071 else if (GET_CODE (vals) == PARALLEL)
22072 {
22073 /* A CONST_VECTOR must contain only CONST_INTs and
22074 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
22075 Only store valid constants in a CONST_VECTOR. */
22076 int n_elts = XVECLEN (vals, 0);
22077 for (i = 0; i < n_elts; ++i)
22078 {
22079 rtx x = XVECEXP (vals, 0, i);
22080 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22081 n_const++;
22082 }
22083 if (n_const == n_elts)
22084 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
22085 }
22086 else
22087 gcc_unreachable ();
22088
22089 if (const_vec != NULL_RTX
22090 && aarch64_simd_valid_immediate (const_vec, NULL))
22091 /* Load using MOVI/MVNI. */
22092 return const_vec;
22093 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
22094 /* Loaded using DUP. */
22095 return const_dup;
22096 else if (const_vec != NULL_RTX)
22097 /* Load from constant pool. We cannot take advantage of single-cycle
22098 LD1 because we need a PC-relative addressing mode. */
22099 return const_vec;
22100 else
22101 /* A PARALLEL containing something not valid inside CONST_VECTOR.
22102 We cannot construct an initializer. */
22103 return NULL_RTX;
22104 }
22105
22106 /* Expand a vector initialisation sequence, such that TARGET is
22107 initialised to contain VALS. */
22108
22109 void
22110 aarch64_expand_vector_init (rtx target, rtx vals)
22111 {
22112 machine_mode mode = GET_MODE (target);
22113 scalar_mode inner_mode = GET_MODE_INNER (mode);
22114 /* The number of vector elements. */
22115 int n_elts = XVECLEN (vals, 0);
22116 /* The number of vector elements which are not constant. */
22117 int n_var = 0;
22118 rtx any_const = NULL_RTX;
22119 /* The first element of vals. */
22120 rtx v0 = XVECEXP (vals, 0, 0);
22121 bool all_same = true;
22122
22123 /* This is a special vec_init<M><N> where N is not an element mode but a
22124 vector mode with half the elements of M. We expect to find two entries
22125 of mode N in VALS and we must put their concatenation into TARGET. */
22126 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
22127 {
22128 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
22129 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
22130 && known_eq (GET_MODE_SIZE (mode),
22131 2 * GET_MODE_SIZE (narrow_mode)));
22132 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22133 XVECEXP (vals, 0, 0),
22134 XVECEXP (vals, 0, 1)));
22135 return;
22136 }
22137
22138 /* Count the number of variable elements to initialise. */
22139 for (int i = 0; i < n_elts; ++i)
22140 {
22141 rtx x = XVECEXP (vals, 0, i);
22142 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
22143 ++n_var;
22144 else
22145 any_const = x;
22146
22147 all_same &= rtx_equal_p (x, v0);
22148 }
22149
22150 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22151 how best to handle this. */
22152 if (n_var == 0)
22153 {
22154 rtx constant = aarch64_simd_make_constant (vals);
22155 if (constant != NULL_RTX)
22156 {
22157 emit_move_insn (target, constant);
22158 return;
22159 }
22160 }
22161
22162 /* Splat a single non-constant element if we can. */
22163 if (all_same)
22164 {
22165 rtx x = copy_to_mode_reg (inner_mode, v0);
22166 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22167 return;
22168 }
22169
22170 /* Check for interleaving case.
22171 For example, if the initializer is (int16x8_t) {x, y, x, y, x, y, x, y},
22172 generate the following code:
22173 dup v0.h, x
22174 dup v1.h, y
22175 zip1 v0.h, v0.h, v1.h
22176 for a "large enough" initializer. */
22177
22178 if (n_elts >= 8)
22179 {
22180 int i;
22181 for (i = 2; i < n_elts; i++)
22182 if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
22183 break;
22184
22185 if (i == n_elts)
22186 {
22187 machine_mode mode = GET_MODE (target);
22188 rtx dest[2];
22189
22190 for (int i = 0; i < 2; i++)
22191 {
22192 rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
22193 dest[i] = force_reg (mode, x);
22194 }
22195
22196 rtvec v = gen_rtvec (2, dest[0], dest[1]);
22197 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22198 return;
22199 }
22200 }
22201
22202 enum insn_code icode = optab_handler (vec_set_optab, mode);
22203 gcc_assert (icode != CODE_FOR_nothing);
22204
22205 /* If there are only variable elements, try to optimize
22206 the insertion using dup for the most common element
22207 followed by insertions. */
22208
22209 /* The algorithm will fill matches[*][0] with the earliest matching element,
22210 and matches[X][1] with the count of duplicate elements (if X is the
22211 earliest element which has duplicates). */
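
/* For an illustrative (hypothetical) input such as {x, y, x, x}, the loop
   below records matches[0][1] == 3, so MAXELEMENT becomes 0: x is duplicated
   across the whole register and only lane 1 still needs an explicit
   insertion of y. */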
22212
22213 if (n_var == n_elts && n_elts <= 16)
22214 {
22215 int matches[16][2] = {0};
22216 for (int i = 0; i < n_elts; i++)
22217 {
22218 for (int j = 0; j <= i; j++)
22219 {
22220 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22221 {
22222 matches[i][0] = j;
22223 matches[j][1]++;
22224 break;
22225 }
22226 }
22227 }
22228 int maxelement = 0;
22229 int maxv = 0;
22230 for (int i = 0; i < n_elts; i++)
22231 if (matches[i][1] > maxv)
22232 {
22233 maxelement = i;
22234 maxv = matches[i][1];
22235 }
22236
22237 /* Create a duplicate of the most common element, unless all elements
22238 are equally useless to us, in which case just immediately set the
22239 vector register using the first element. */
22240
22241 if (maxv == 1)
22242 {
22243 /* For vectors of two 64-bit elements, we can do even better. */
22244 if (n_elts == 2
22245 && (inner_mode == E_DImode
22246 || inner_mode == E_DFmode))
22247
22248 {
22249 rtx x0 = XVECEXP (vals, 0, 0);
22250 rtx x1 = XVECEXP (vals, 0, 1);
22251 /* Combine can pick up this case, but handling it directly
22252 here leaves clearer RTL.
22253
22254 This is load_pair_lanes<mode>, and also gives us a clean-up
22255 for store_pair_lanes<mode>. */
22256 if (memory_operand (x0, inner_mode)
22257 && memory_operand (x1, inner_mode)
22258 && aarch64_mergeable_load_pair_p (mode, x0, x1))
22259 {
22260 rtx t;
22261 if (inner_mode == DFmode)
22262 t = gen_load_pair_lanesdf (target, x0, x1);
22263 else
22264 t = gen_load_pair_lanesdi (target, x0, x1);
22265 emit_insn (t);
22266 return;
22267 }
22268 }
22269 /* The subreg-move sequence below will move into lane zero of the
22270 vector register. For big-endian we want that position to hold
22271 the last element of VALS. */
22272 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22273 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22274 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22275 }
22276 else
22277 {
22278 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22279 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22280 }
22281
22282 /* Insert the rest. */
22283 for (int i = 0; i < n_elts; i++)
22284 {
22285 rtx x = XVECEXP (vals, 0, i);
22286 if (matches[i][0] == maxelement)
22287 continue;
22288 x = copy_to_mode_reg (inner_mode, x);
22289 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22290 }
22291 return;
22292 }
22293
22294 /* Initialise a vector which is part-variable. We want to first try
22295 to build those lanes which are constant in the most efficient way we
22296 can. */
22297 if (n_var != n_elts)
22298 {
22299 rtx copy = copy_rtx (vals);
22300
22301 /* Load constant part of vector. We really don't care what goes into the
22302 parts we will overwrite, but we're more likely to be able to load the
22303 constant efficiently if it has fewer, larger, repeating parts
22304 (see aarch64_simd_valid_immediate). */
22305 for (int i = 0; i < n_elts; i++)
22306 {
22307 rtx x = XVECEXP (vals, 0, i);
22308 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22309 continue;
22310 rtx subst = any_const;
22311 for (int bit = n_elts / 2; bit > 0; bit /= 2)
22312 {
22313 /* Look in the copied vector, as more elements are const. */
22314 rtx test = XVECEXP (copy, 0, i ^ bit);
22315 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22316 {
22317 subst = test;
22318 break;
22319 }
22320 }
22321 XVECEXP (copy, 0, i) = subst;
22322 }
22323 aarch64_expand_vector_init (target, copy);
22324 }
22325
22326 /* Insert the variable lanes directly. */
22327 for (int i = 0; i < n_elts; i++)
22328 {
22329 rtx x = XVECEXP (vals, 0, i);
22330 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22331 continue;
22332 x = copy_to_mode_reg (inner_mode, x);
22333 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22334 }
22335 }
22336
22337 /* Emit RTL corresponding to:
22338 insr TARGET, ELEM. */
22339
22340 static void
22341 emit_insr (rtx target, rtx elem)
22342 {
22343 machine_mode mode = GET_MODE (target);
22344 scalar_mode elem_mode = GET_MODE_INNER (mode);
22345 elem = force_reg (elem_mode, elem);
22346
22347 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22348 gcc_assert (icode != CODE_FOR_nothing);
22349 emit_insn (GEN_FCN (icode) (target, target, elem));
22350 }
22351
22352 /* Subroutine of aarch64_sve_expand_vector_init for handling
22353 trailing constants.
22354 This function works as follows:
22355 (a) Create a new vector consisting of trailing constants.
22356 (b) Initialize TARGET with the constant vector using emit_move_insn.
22357 (c) Insert remaining elements in TARGET using insr.
22358 NELTS is the total number of elements in the original vector, while
22359 NELTS_REQD is the number of elements that are actually
22360 significant.
22361
22362 ??? The heuristic used is to do the above only if the number of trailing
22363 constants is at least half the total number of elements. May need fine-tuning. */
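
/* As a rough illustration, for a 4-element group {a, b, 1, 2} this path
   first loads a constant vector whose leading lanes are {1, 2, ...} and then
   issues "insr" of b followed by "insr" of a, leaving {a, b, 1, 2, ...}
   in TARGET. */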
22364
22365 static bool
22366 aarch64_sve_expand_vector_init_handle_trailing_constants
22367 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22368 {
22369 machine_mode mode = GET_MODE (target);
22370 scalar_mode elem_mode = GET_MODE_INNER (mode);
22371 int n_trailing_constants = 0;
22372
22373 for (int i = nelts_reqd - 1;
22374 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22375 i--)
22376 n_trailing_constants++;
22377
22378 if (n_trailing_constants >= nelts_reqd / 2)
22379 {
22380 /* Try to use the natural pattern of BUILDER to extend the trailing
22381 constant elements to a full vector. Replace any variables in the
22382 extra elements with zeros.
22383
22384 ??? It would be better if the builders supported "don't care"
22385 elements, with the builder filling in whichever elements
22386 give the most compact encoding. */
22387 rtx_vector_builder v (mode, nelts, 1);
22388 for (int i = 0; i < nelts; i++)
22389 {
22390 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22391 if (!valid_for_const_vector_p (elem_mode, x))
22392 x = CONST0_RTX (elem_mode);
22393 v.quick_push (x);
22394 }
22395 rtx const_vec = v.build ();
22396 emit_move_insn (target, const_vec);
22397
22398 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22399 emit_insr (target, builder.elt (i));
22400
22401 return true;
22402 }
22403
22404 return false;
22405 }
22406
22407 /* Subroutine of aarch64_sve_expand_vector_init.
22408 Works as follows:
22409 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22410 (b) Skip trailing elements from BUILDER, which are the same as
22411 element NELTS_REQD - 1.
22412 (c) Insert earlier elements in reverse order in TARGET using insr. */
22413
22414 static void
22415 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22416 const rtx_vector_builder &builder,
22417 int nelts_reqd)
22418 {
22419 machine_mode mode = GET_MODE (target);
22420 scalar_mode elem_mode = GET_MODE_INNER (mode);
22421
22422 struct expand_operand ops[2];
22423 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22424 gcc_assert (icode != CODE_FOR_nothing);
22425
22426 create_output_operand (&ops[0], target, mode);
22427 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22428 expand_insn (icode, 2, ops);
22429
22430 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22431 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22432 emit_insr (target, builder.elt (i));
22433 }
22434
22435 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22436 when all trailing elements of builder are same.
22437 This works as follows:
22438 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22439 (b) Insert remaining elements in TARGET using insr.
22440
22441 ??? The heuristic used is to do the above if the number of identical
22442 trailing elements is at least 3/4 of the total number of elements,
22443 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
22444
22445 static bool
22446 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22447 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22448 {
22449 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22450 if (ndups >= (3 * nelts_reqd) / 4)
22451 {
22452 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22453 nelts_reqd - ndups + 1);
22454 return true;
22455 }
22456
22457 return false;
22458 }
22459
22460 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22461 of elements in BUILDER.
22462
22463 The function tries to initialize TARGET from BUILDER if it fits one
22464 of the special cases outlined below.
22465
22466 Failing that, the function divides BUILDER into two sub-vectors:
22467 v_even = even elements of BUILDER;
22468 v_odd = odd elements of BUILDER;
22469
22470 and recursively calls itself with v_even and v_odd.
22471
22472 if (recursive call succeeded for v_even or v_odd)
22473 TARGET = zip (v_even, v_odd)
22474
22475 The function returns true if it managed to build TARGET from BUILDER
22476 with one of the special cases, false otherwise.
22477
22478 Example: {a, 1, b, 2, c, 3, d, 4}
22479
22480 The vector gets divided into:
22481 v_even = {a, b, c, d}
22482 v_odd = {1, 2, 3, 4}
22483
22484 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22485 initializes tmp2 from the constant vector v_odd using emit_move_insn.
22486
22487 aarch64_sve_expand_vector_init(v_even) fails since v_even contains only
22488 variable elements (and just 4 of them), so we construct tmp1 from v_even using insr:
22489 tmp1 = dup(d)
22490 insr tmp1, c
22491 insr tmp1, b
22492 insr tmp1, a
22493
22494 And finally:
22495 TARGET = zip (tmp1, tmp2)
22496 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22497
22498 static bool
22499 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22500 int nelts, int nelts_reqd)
22501 {
22502 machine_mode mode = GET_MODE (target);
22503
22504 /* Case 1: Vector contains trailing constants. */
22505
22506 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22507 (target, builder, nelts, nelts_reqd))
22508 return true;
22509
22510 /* Case 2: Vector contains leading constants. */
22511
22512 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22513 for (int i = 0; i < nelts_reqd; i++)
22514 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22515 rev_builder.finalize ();
22516
22517 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22518 (target, rev_builder, nelts, nelts_reqd))
22519 {
22520 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22521 return true;
22522 }
22523
22524 /* Case 3: Vector contains trailing same element. */
22525
22526 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22527 (target, builder, nelts_reqd))
22528 return true;
22529
22530 /* Case 4: Vector contains leading same element. */
22531
22532 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22533 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22534 {
22535 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22536 return true;
22537 }
22538
22539 /* Avoid recursing below 4 elements.
22540 ??? The threshold 4 may need fine-tuning. */
22541
22542 if (nelts_reqd <= 4)
22543 return false;
22544
22545 rtx_vector_builder v_even (mode, nelts, 1);
22546 rtx_vector_builder v_odd (mode, nelts, 1);
22547
22548 for (int i = 0; i < nelts * 2; i += 2)
22549 {
22550 v_even.quick_push (builder.elt (i));
22551 v_odd.quick_push (builder.elt (i + 1));
22552 }
22553
22554 v_even.finalize ();
22555 v_odd.finalize ();
22556
22557 rtx tmp1 = gen_reg_rtx (mode);
22558 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22559 nelts, nelts_reqd / 2);
22560
22561 rtx tmp2 = gen_reg_rtx (mode);
22562 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22563 nelts, nelts_reqd / 2);
22564
22565 if (!did_even_p && !did_odd_p)
22566 return false;
22567
22568 /* For whichever of v_even and v_odd did not match a special case,
22569 initialize its temporary using INSR, then zip the two together. */
22570
22571 if (!did_even_p)
22572 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22573
22574 if (!did_odd_p)
22575 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22576
22577 rtvec v = gen_rtvec (2, tmp1, tmp2);
22578 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22579 return true;
22580 }
22581
22582 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22583
22584 void
22585 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22586 {
22587 machine_mode mode = GET_MODE (target);
22588 int nelts = XVECLEN (vals, 0);
22589
22590 rtx_vector_builder v (mode, nelts, 1);
22591 for (int i = 0; i < nelts; i++)
22592 v.quick_push (XVECEXP (vals, 0, i));
22593 v.finalize ();
22594
22595 /* If neither sub-vector of v could be initialized specially,
22596 then use INSR to insert all elements from v into TARGET.
22597 ??? This might not be optimal for vectors with large
22598 initializers of 16 elements or more.
22599 For nelts < 4, it probably isn't useful to handle specially. */
22600
22601 if (nelts < 4
22602 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22603 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22604 }
22605
22606 /* Check whether VALUE is a vector constant in which every element
22607 is either a power of 2 or a negated power of 2. If so, return
22608 a constant vector of log2s, and flip CODE between PLUS and MINUS
22609 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
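
/* For example (values chosen for illustration): {8, 8, 8, 8} is converted
   to the shift-amount vector {3, 3, 3, 3} with CODE unchanged, while
   {-4, -4, -4, -4} is converted to {2, 2, 2, 2} and CODE is flipped
   between PLUS and MINUS. */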
22610
22611 static rtx
22612 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22613 {
22614 if (!CONST_VECTOR_P (value))
22615 return NULL_RTX;
22616
22617 rtx_vector_builder builder;
22618 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22619 return NULL_RTX;
22620
22621 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22622 /* 1 if the result of the multiplication must be negated,
22623 0 if it mustn't, or -1 if we don't yet care. */
22624 int negate = -1;
22625 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22626 for (unsigned int i = 0; i < encoded_nelts; ++i)
22627 {
22628 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22629 if (!CONST_SCALAR_INT_P (elt))
22630 return NULL_RTX;
22631 rtx_mode_t val (elt, int_mode);
22632 wide_int pow2 = wi::neg (val);
22633 if (val != pow2)
22634 {
22635 /* It matters whether we negate or not. Make that choice,
22636 and make sure that it's consistent with previous elements. */
22637 if (negate == !wi::neg_p (val))
22638 return NULL_RTX;
22639 negate = wi::neg_p (val);
22640 if (!negate)
22641 pow2 = val;
22642 }
22643 /* POW2 is now the value that we want to be a power of 2. */
22644 int shift = wi::exact_log2 (pow2);
22645 if (shift < 0)
22646 return NULL_RTX;
22647 builder.quick_push (gen_int_mode (shift, int_mode));
22648 }
22649 if (negate == -1)
22650 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22651 code = PLUS;
22652 else if (negate == 1)
22653 code = code == PLUS ? MINUS : PLUS;
22654 return builder.build ();
22655 }
22656
22657 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22658 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22659 operands array, in the same order as for fma_optab. Return true if
22660 the function emitted all the necessary instructions, false if the caller
22661 should generate the pattern normally with the new OPERANDS array. */
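
/* As an illustration, a multiply-add whose multiplier operand is the
   constant vector {4, 4, ...} is emitted here as a vector shift left by 2
   followed by an ordinary add, avoiding the multiply altogether. */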
22662
22663 bool
22664 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22665 {
22666 machine_mode mode = GET_MODE (operands[0]);
22667 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22668 {
22669 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22670 NULL_RTX, true, OPTAB_DIRECT);
22671 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22672 operands[3], product, operands[0], true,
22673 OPTAB_DIRECT);
22674 return true;
22675 }
22676 operands[2] = force_reg (mode, operands[2]);
22677 return false;
22678 }
22679
22680 /* Likewise, but for a conditional pattern. */
22681
22682 bool
22683 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22684 {
22685 machine_mode mode = GET_MODE (operands[0]);
22686 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22687 {
22688 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22689 NULL_RTX, true, OPTAB_DIRECT);
22690 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22691 operands[4], product, operands[5]));
22692 return true;
22693 }
22694 operands[3] = force_reg (mode, operands[3]);
22695 return false;
22696 }
22697
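/* Return the mask of bits actually used by a shift count in MODE for the
   shift truncation mask target hook: zero when shift counts are not known
   to be truncated (SHIFT_COUNT_TRUNCATED is false or MODE is a vector
   data mode), otherwise one less than the element bit size. */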
22698 static unsigned HOST_WIDE_INT
22699 aarch64_shift_truncation_mask (machine_mode mode)
22700 {
22701 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22702 return 0;
22703 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22704 }
22705
22706 /* Select a format to encode pointers in exception handling data. */
22707 int
22708 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22709 {
22710 int type;
22711 switch (aarch64_cmodel)
22712 {
22713 case AARCH64_CMODEL_TINY:
22714 case AARCH64_CMODEL_TINY_PIC:
22715 case AARCH64_CMODEL_SMALL:
22716 case AARCH64_CMODEL_SMALL_PIC:
22717 case AARCH64_CMODEL_SMALL_SPIC:
22718 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22719 for everything. */
22720 type = DW_EH_PE_sdata4;
22721 break;
22722 default:
22723 /* No assumptions here. 8-byte relocs required. */
22724 type = DW_EH_PE_sdata8;
22725 break;
22726 }
22727 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22728 }
22729
22730 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22731
22732 static void
22733 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22734 {
22735 if (TREE_CODE (decl) == FUNCTION_DECL)
22736 {
22737 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22738 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22739 {
22740 fprintf (stream, "\t.variant_pcs\t");
22741 assemble_name (stream, name);
22742 fprintf (stream, "\n");
22743 }
22744 }
22745 }
22746
22747 /* The last .arch and .tune assembly strings that we printed. */
22748 static std::string aarch64_last_printed_arch_string;
22749 static std::string aarch64_last_printed_tune_string;
22750
22751 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22752 by the function fndecl. */
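
/* For instance (hypothetical options), switching to a function compiled
   with different target options might cause a directive such as
   ".arch armv8-a+lse" to be printed just before its label; consecutive
   identical .arch strings are suppressed. */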
22753
22754 void
22755 aarch64_declare_function_name (FILE *stream, const char* name,
22756 tree fndecl)
22757 {
22758 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22759
22760 struct cl_target_option *targ_options;
22761 if (target_parts)
22762 targ_options = TREE_TARGET_OPTION (target_parts);
22763 else
22764 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22765 gcc_assert (targ_options);
22766
22767 const struct processor *this_arch
22768 = aarch64_get_arch (targ_options->x_selected_arch);
22769
22770 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
22771 std::string extension
22772 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22773 this_arch->flags);
22774 /* Only update the assembler .arch string if it is distinct from the last
22775 such string we printed. */
22776 std::string to_print = this_arch->name + extension;
22777 if (to_print != aarch64_last_printed_arch_string)
22778 {
22779 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22780 aarch64_last_printed_arch_string = to_print;
22781 }
22782
22783 /* Print the cpu name we're tuning for in the comments; it might be
22784 useful to readers of the generated asm. Do it only when it changes
22785 from function to function and verbose assembly is requested. */
22786 const struct processor *this_tune
22787 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22788
22789 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22790 {
22791 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22792 this_tune->name);
22793 aarch64_last_printed_tune_string = this_tune->name;
22794 }
22795
22796 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22797
22798 /* Don't forget the type directive for ELF. */
22799 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22800 ASM_OUTPUT_LABEL (stream, name);
22801
22802 cfun->machine->label_is_assembled = true;
22803 }
22804
22805 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
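
/* A rough illustration: with BTI enabled and a function that may be called
   indirectly, -fpatchable-function-entry=2 produces an entry of the form
   (illustrative)
   foo:
   bti c
   nop
   nop
   i.e. the patch area is placed after the BTI landing pad rather than
   before it. */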
22806
22807 void
22808 aarch64_print_patchable_function_entry (FILE *file,
22809 unsigned HOST_WIDE_INT patch_area_size,
22810 bool record_p)
22811 {
22812 if (!cfun->machine->label_is_assembled)
22813 {
22814 /* Emit the patching area before the entry label, if any. */
22815 default_print_patchable_function_entry (file, patch_area_size,
22816 record_p);
22817 return;
22818 }
22819
22820 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
22821 GEN_INT (record_p));
22822 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
22823
22824 if (!aarch64_bti_enabled ()
22825 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
22826 {
22827 /* Emit the patchable_area at the beginning of the function. */
22828 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
22829 INSN_ADDRESSES_NEW (insn, -1);
22830 return;
22831 }
22832
22833 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22834 if (!insn
22835 || !INSN_P (insn)
22836 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
22837 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
22838 {
22839 /* Emit a BTI_C. */
22840 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
22841 }
22842
22843 /* Emit the patchable_area after BTI_C. */
22844 insn = emit_insn_after (pa, insn);
22845 INSN_ADDRESSES_NEW (insn, -1);
22846 }
22847
22848 /* Output patchable area. */
22849
22850 void
22851 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
22852 {
22853 default_print_patchable_function_entry (asm_out_file, patch_area_size,
22854 record_p);
22855 }
22856
22857 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22858
22859 void
22860 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22861 {
22862 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22863 const char *value = IDENTIFIER_POINTER (target);
22864 aarch64_asm_output_variant_pcs (stream, decl, name);
22865 ASM_OUTPUT_DEF (stream, name, value);
22866 }
22867
22868 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22869 function symbol references. */
22870
22871 void
22872 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22873 {
22874 default_elf_asm_output_external (stream, decl, name);
22875 aarch64_asm_output_variant_pcs (stream, decl, name);
22876 }
22877
22878 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22879 Used to output the .cfi_b_key_frame directive when signing the current
22880 function with the B key. */
22881
22882 void
22883 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22884 {
22885 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22886 && aarch64_ra_sign_key == AARCH64_KEY_B)
22887 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22888 }
22889
22890 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22891
22892 static void
22893 aarch64_start_file (void)
22894 {
22895 struct cl_target_option *default_options
22896 = TREE_TARGET_OPTION (target_option_default_node);
22897
22898 const struct processor *default_arch
22899 = aarch64_get_arch (default_options->x_selected_arch);
22900 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
22901 std::string extension
22902 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22903 default_arch->flags);
22904
22905 aarch64_last_printed_arch_string = default_arch->name + extension;
22906 aarch64_last_printed_tune_string = "";
22907 asm_fprintf (asm_out_file, "\t.arch %s\n",
22908 aarch64_last_printed_arch_string.c_str ());
22909
22910 default_file_start ();
22911 }
22912
22913 /* Emit load exclusive. */
22914
22915 static void
22916 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22917 rtx mem, rtx model_rtx)
22918 {
22919 if (mode == TImode)
22920 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22921 gen_highpart (DImode, rval),
22922 mem, model_rtx));
22923 else
22924 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22925 }
22926
22927 /* Emit store exclusive. */
22928
22929 static void
22930 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22931 rtx mem, rtx rval, rtx model_rtx)
22932 {
22933 if (mode == TImode)
22934 emit_insn (gen_aarch64_store_exclusive_pair
22935 (bval, mem, operand_subword (rval, 0, 0, TImode),
22936 operand_subword (rval, 1, 0, TImode), model_rtx));
22937 else
22938 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22939 }
22940
22941 /* Mark the previous jump instruction as unlikely. */
22942
22943 static void
22944 aarch64_emit_unlikely_jump (rtx insn)
22945 {
22946 rtx_insn *jump = emit_jump_insn (insn);
22947 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22948 }
22949
22950 /* We store the names of the various atomic helpers in a 5x5 array.
22951 Return the libcall function given MODE, MODEL and NAMES. */
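
/* For example, an SImode compare-and-swap with an acquire memory model
   selects mode index 2 and model index 1, i.e. the "__aarch64_cas4_acq"
   helper named by the DEF0/DEF5 tables below. */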
22952
22953 rtx
22954 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22955 const atomic_ool_names *names)
22956 {
22957 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22958 int mode_idx, model_idx;
22959
22960 switch (mode)
22961 {
22962 case E_QImode:
22963 mode_idx = 0;
22964 break;
22965 case E_HImode:
22966 mode_idx = 1;
22967 break;
22968 case E_SImode:
22969 mode_idx = 2;
22970 break;
22971 case E_DImode:
22972 mode_idx = 3;
22973 break;
22974 case E_TImode:
22975 mode_idx = 4;
22976 break;
22977 default:
22978 gcc_unreachable ();
22979 }
22980
22981 switch (model)
22982 {
22983 case MEMMODEL_RELAXED:
22984 model_idx = 0;
22985 break;
22986 case MEMMODEL_CONSUME:
22987 case MEMMODEL_ACQUIRE:
22988 model_idx = 1;
22989 break;
22990 case MEMMODEL_RELEASE:
22991 model_idx = 2;
22992 break;
22993 case MEMMODEL_ACQ_REL:
22994 case MEMMODEL_SEQ_CST:
22995 model_idx = 3;
22996 break;
22997 case MEMMODEL_SYNC_ACQUIRE:
22998 case MEMMODEL_SYNC_RELEASE:
22999 case MEMMODEL_SYNC_SEQ_CST:
23000 model_idx = 4;
23001 break;
23002 default:
23003 gcc_unreachable ();
23004 }
23005
23006 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
23007 VISIBILITY_HIDDEN);
23008 }
23009
23010 #define DEF0(B, N) \
23011 { "__aarch64_" #B #N "_relax", \
23012 "__aarch64_" #B #N "_acq", \
23013 "__aarch64_" #B #N "_rel", \
23014 "__aarch64_" #B #N "_acq_rel", \
23015 "__aarch64_" #B #N "_sync" }
23016
23017 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
23018 { NULL, NULL, NULL, NULL }
23019 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
23020
23021 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
23022 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
23023 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
23024 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
23025 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
23026 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
23027
23028 #undef DEF0
23029 #undef DEF4
23030 #undef DEF5
23031
23032 /* Expand a compare and swap pattern. */
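
/* Three strategies are used below: a single CAS instruction when LSE is
   available, a call to an out-of-line "__aarch64_casN_*" helper when
   outline atomics are enabled, and otherwise a load/store-exclusive loop
   expanded via code_for_aarch64_compare_and_swap. */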
23033
23034 void
23035 aarch64_expand_compare_and_swap (rtx operands[])
23036 {
23037 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
23038 machine_mode mode, r_mode;
23039
23040 bval = operands[0];
23041 rval = operands[1];
23042 mem = operands[2];
23043 oldval = operands[3];
23044 newval = operands[4];
23045 is_weak = operands[5];
23046 mod_s = operands[6];
23047 mod_f = operands[7];
23048 mode = GET_MODE (mem);
23049
23050 /* Normally the succ memory model must be stronger than fail, but in the
23051 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
23052 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
23053 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
23054 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
23055 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
23056
23057 r_mode = mode;
23058 if (mode == QImode || mode == HImode)
23059 {
23060 r_mode = SImode;
23061 rval = gen_reg_rtx (r_mode);
23062 }
23063
23064 if (TARGET_LSE)
23065 {
23066 /* The CAS insn requires oldval and rval overlap, but we need to
23067 have a copy of oldval saved across the operation to tell if
23068 the operation is successful. */
23069 if (reg_overlap_mentioned_p (rval, oldval))
23070 rval = copy_to_mode_reg (r_mode, oldval);
23071 else
23072 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
23073
23074 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
23075 newval, mod_s));
23076 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23077 }
23078 else if (TARGET_OUTLINE_ATOMICS)
23079 {
23080 /* Oldval must satisfy compare afterward. */
23081 if (!aarch64_plus_operand (oldval, mode))
23082 oldval = force_reg (mode, oldval);
23083 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
23084 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
23085 oldval, mode, newval, mode,
23086 XEXP (mem, 0), Pmode);
23087 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23088 }
23089 else
23090 {
23091 /* The oldval predicate varies by mode. Test it and force to reg. */
23092 insn_code code = code_for_aarch64_compare_and_swap (mode);
23093 if (!insn_data[code].operand[2].predicate (oldval, mode))
23094 oldval = force_reg (mode, oldval);
23095
23096 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
23097 is_weak, mod_s, mod_f));
23098 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
23099 }
23100
23101 if (r_mode != mode)
23102 rval = gen_lowpart (mode, rval);
23103 emit_move_insn (operands[1], rval);
23104
23105 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
23106 emit_insn (gen_rtx_SET (bval, x));
23107 }
23108
23109 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
23110 sequence implementing an atomic operation. */
23111
23112 static void
23113 aarch64_emit_post_barrier (enum memmodel model)
23114 {
23115 const enum memmodel base_model = memmodel_base (model);
23116
23117 if (is_mm_sync (model)
23118 && (base_model == MEMMODEL_ACQUIRE
23119 || base_model == MEMMODEL_ACQ_REL
23120 || base_model == MEMMODEL_SEQ_CST))
23121 {
23122 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
23123 }
23124 }
23125
23126 /* Split a compare and swap pattern. */
23127
23128 void
23129 aarch64_split_compare_and_swap (rtx operands[])
23130 {
23131 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23132 gcc_assert (epilogue_completed);
23133
23134 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
23135 machine_mode mode;
23136 bool is_weak;
23137 rtx_code_label *label1, *label2;
23138 enum memmodel model;
23139
23140 rval = operands[0];
23141 mem = operands[1];
23142 oldval = operands[2];
23143 newval = operands[3];
23144 is_weak = (operands[4] != const0_rtx);
23145 model_rtx = operands[5];
23146 scratch = operands[7];
23147 mode = GET_MODE (mem);
23148 model = memmodel_from_int (INTVAL (model_rtx));
23149
23150 /* When OLDVAL is zero and we want the strong version we can emit a tighter
23151 loop:
23152 .label1:
23153 LD[A]XR rval, [mem]
23154 CBNZ rval, .label2
23155 ST[L]XR scratch, newval, [mem]
23156 CBNZ scratch, .label1
23157 .label2:
23158 CMP rval, 0. */
23159 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23160 oldval == const0_rtx && mode != TImode);
23161
23162 label1 = NULL;
23163 if (!is_weak)
23164 {
23165 label1 = gen_label_rtx ();
23166 emit_label (label1);
23167 }
23168 label2 = gen_label_rtx ();
23169
23170 /* The initial load can be relaxed for a __sync operation since a final
23171 barrier will be emitted to stop code hoisting. */
23172 if (is_mm_sync (model))
23173 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
23174 else
23175 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
23176
23177 if (strong_zero_p)
23178 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
23179 else
23180 {
23181 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23182 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
23183 }
23184 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23185 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23186 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23187
23188 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
23189
23190 if (!is_weak)
23191 {
23192 if (aarch64_track_speculation)
23193 {
23194 /* Emit an explicit compare instruction, so that we can correctly
23195 track the condition codes. */
23196 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23197 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23198 }
23199 else
23200 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23201
23202 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23203 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
23204 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23205 }
23206 else
23207 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23208
23209 emit_label (label2);
23210
23211 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
23212 to set the condition flags. If this is not used it will be removed by
23213 later passes. */
23214 if (strong_zero_p)
23215 aarch64_gen_compare_reg (NE, rval, const0_rtx);
23216
23217 /* Emit any final barrier needed for a __sync operation. */
23218 if (is_mm_sync (model))
23219 aarch64_emit_post_barrier (model);
23220 }
23221
23222 /* Split an atomic operation. */
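
/* As a minimal illustration (register numbers are arbitrary), an SImode
   atomic add with a relaxed memory model and no LSE splits into a loop
   of the form:
   .L1: ldxr w0, [x1]
   add w2, w0, w3
   stxr w4, w2, [x1]
   cbnz w4, .L1 */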
23223
23224 void
23225 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
23226 rtx value, rtx model_rtx, rtx cond)
23227 {
23228 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23229 gcc_assert (epilogue_completed);
23230
23231 machine_mode mode = GET_MODE (mem);
23232 machine_mode wmode = (mode == DImode ? DImode : SImode);
23233 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23234 const bool is_sync = is_mm_sync (model);
23235 rtx_code_label *label;
23236 rtx x;
23237
23238 /* Split the atomic operation into a sequence. */
23239 label = gen_label_rtx ();
23240 emit_label (label);
23241
23242 if (new_out)
23243 new_out = gen_lowpart (wmode, new_out);
23244 if (old_out)
23245 old_out = gen_lowpart (wmode, old_out);
23246 else
23247 old_out = new_out;
23248 value = simplify_gen_subreg (wmode, value, mode, 0);
23249
23250 /* The initial load can be relaxed for a __sync operation since a final
23251 barrier will be emitted to stop code hoisting. */
23252 if (is_sync)
23253 aarch64_emit_load_exclusive (mode, old_out, mem,
23254 GEN_INT (MEMMODEL_RELAXED));
23255 else
23256 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
23257
23258 switch (code)
23259 {
23260 case SET:
23261 new_out = value;
23262 break;
23263
23264 case NOT:
23265 x = gen_rtx_AND (wmode, old_out, value);
23266 emit_insn (gen_rtx_SET (new_out, x));
23267 x = gen_rtx_NOT (wmode, new_out);
23268 emit_insn (gen_rtx_SET (new_out, x));
23269 break;
23270
23271 case MINUS:
23272 if (CONST_INT_P (value))
23273 {
23274 value = GEN_INT (-UINTVAL (value));
23275 code = PLUS;
23276 }
23277 /* Fall through. */
23278
23279 default:
23280 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
23281 emit_insn (gen_rtx_SET (new_out, x));
23282 break;
23283 }
23284
23285 aarch64_emit_store_exclusive (mode, cond, mem,
23286 gen_lowpart (mode, new_out), model_rtx);
23287
23288 if (aarch64_track_speculation)
23289 {
23290 /* Emit an explicit compare instruction, so that we can correctly
23291 track the condition codes. */
23292 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23293 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23294 }
23295 else
23296 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23297
23298 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23299 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
23300 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23301
23302 /* Emit any final barrier needed for a __sync operation. */
23303 if (is_sync)
23304 aarch64_emit_post_barrier (model);
23305 }
23306
23307 static void
23308 aarch64_init_libfuncs (void)
23309 {
23310 /* Half-precision float operations. The compiler handles all operations
23311 with NULL libfuncs by converting to SFmode. */
23312
23313 /* Conversions. */
23314 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23315 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23316
23317 /* Arithmetic. */
23318 set_optab_libfunc (add_optab, HFmode, NULL);
23319 set_optab_libfunc (sdiv_optab, HFmode, NULL);
23320 set_optab_libfunc (smul_optab, HFmode, NULL);
23321 set_optab_libfunc (neg_optab, HFmode, NULL);
23322 set_optab_libfunc (sub_optab, HFmode, NULL);
23323
23324 /* Comparisons. */
23325 set_optab_libfunc (eq_optab, HFmode, NULL);
23326 set_optab_libfunc (ne_optab, HFmode, NULL);
23327 set_optab_libfunc (lt_optab, HFmode, NULL);
23328 set_optab_libfunc (le_optab, HFmode, NULL);
23329 set_optab_libfunc (ge_optab, HFmode, NULL);
23330 set_optab_libfunc (gt_optab, HFmode, NULL);
23331 set_optab_libfunc (unord_optab, HFmode, NULL);
23332 }
23333
23334 /* Target hook for c_mode_for_suffix. */
23335 static machine_mode
23336 aarch64_c_mode_for_suffix (char suffix)
23337 {
23338 if (suffix == 'q')
23339 return TFmode;
23340
23341 return VOIDmode;
23342 }
23343
23344 /* We can only represent floating point constants which will fit in
23345 "quarter-precision" values. These values are characterised by
23346 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
23347 by:
23348
23349 (-1)^s * (n/16) * 2^r
23350
23351 Where:
23352 's' is the sign bit.
23353 'n' is an integer in the range 16 <= n <= 31.
23354 'r' is an integer in the range -3 <= r <= 4. */
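
/* For example, 1.25 is representable as (20/16) * 2^0 and 0.5 as
   (16/16) * 2^-1, whereas a value such as 1/3 has no representation of
   this form and is rejected. */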
23355
23356 /* Return true iff X can be represented by a quarter-precision
23357 floating point immediate operand. Note, we cannot represent 0.0. */
23358 bool
23359 aarch64_float_const_representable_p (rtx x)
23360 {
23361 /* This represents our current view of how many bits
23362 make up the mantissa. */
23363 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23364 int exponent;
23365 unsigned HOST_WIDE_INT mantissa, mask;
23366 REAL_VALUE_TYPE r, m;
23367 bool fail;
23368
23369 x = unwrap_const_vec_duplicate (x);
23370 if (!CONST_DOUBLE_P (x))
23371 return false;
23372
23373 if (GET_MODE (x) == VOIDmode
23374 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23375 return false;
23376
23377 r = *CONST_DOUBLE_REAL_VALUE (x);
23378
23379 /* We cannot represent infinities, NaNs or +/-zero. We won't
23380 know if we have +zero until we analyse the mantissa, but we
23381 can reject the other invalid values. */
23382 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23383 || REAL_VALUE_MINUS_ZERO (r))
23384 return false;
23385
23386 /* Extract exponent. */
23387 r = real_value_abs (&r);
23388 exponent = REAL_EXP (&r);
23389
23390 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23391 highest (sign) bit, with a fixed binary point at bit point_pos.
23392 m1 holds the low part of the mantissa, m2 the high part.
23393 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23394 bits for the mantissa, this can fail (low bits will be lost). */
23395 real_ldexp (&m, &r, point_pos - exponent);
23396 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23397
23398 /* If the low part of the mantissa has bits set we cannot represent
23399 the value. */
23400 if (w.ulow () != 0)
23401 return false;
23402 /* We have rejected the lower HOST_WIDE_INT, so update our
23403 understanding of how many bits lie in the mantissa and
23404 look only at the high HOST_WIDE_INT. */
23405 mantissa = w.elt (1);
23406 point_pos -= HOST_BITS_PER_WIDE_INT;
23407
23408 /* We can only represent values with a mantissa of the form 1.xxxx. */
23409 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23410 if ((mantissa & mask) != 0)
23411 return false;
23412
23413 /* Having filtered unrepresentable values, we may now remove all
23414 but the highest 5 bits. */
23415 mantissa >>= point_pos - 5;
23416
23417 /* We cannot represent the value 0.0, so reject it. This is handled
23418 elsewhere. */
23419 if (mantissa == 0)
23420 return false;
23421
23422 /* Then, as bit 4 is always set, we can mask it off, leaving
23423 the mantissa in the range [0, 15]. */
23424 mantissa &= ~(1 << 4);
23425 gcc_assert (mantissa <= 15);
23426
23427 /* GCC internally does not use IEEE754-like encoding (where normalized
23428 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
23429 Our mantissa values are shifted 4 places to the left relative to
23430 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23431 by 5 places to correct for GCC's representation. */
23432 exponent = 5 - exponent;
23433
23434 return (exponent >= 0 && exponent <= 7);
23435 }
23436
23437 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23438 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23439 output MOVI/MVNI, ORR or BIC immediate. */
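
/* For instance (illustrative operand numbers), a V4SI constant with every
   element equal to 0x100 yields the template "movi\t%0.4s, 0x1, lsl 8",
   and a floating-point zero vector is redirected through the integer
   MOVI path below. */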
23440 char*
23441 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23442 enum simd_immediate_check which)
23443 {
23444 bool is_valid;
23445 static char templ[40];
23446 const char *mnemonic;
23447 const char *shift_op;
23448 unsigned int lane_count = 0;
23449 char element_char;
23450
23451 struct simd_immediate_info info;
23452
23453 /* This will return true to show const_vector is legal for use as either
23454 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23455 It will also update INFO to show how the immediate should be generated.
23456 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23457 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23458 gcc_assert (is_valid);
23459
23460 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23461 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23462
23463 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23464 {
23465 gcc_assert (info.insn == simd_immediate_info::MOV
23466 && info.u.mov.shift == 0);
23467 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23468 move immediate path. */
23469 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23470 info.u.mov.value = GEN_INT (0);
23471 else
23472 {
23473 const unsigned int buf_size = 20;
23474 char float_buf[buf_size] = {'\0'};
23475 real_to_decimal_for_mode (float_buf,
23476 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23477 buf_size, buf_size, 1, info.elt_mode);
23478
23479 if (lane_count == 1)
23480 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23481 else
23482 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23483 lane_count, element_char, float_buf);
23484 return templ;
23485 }
23486 }
23487
23488 gcc_assert (CONST_INT_P (info.u.mov.value));
23489
23490 if (which == AARCH64_CHECK_MOV)
23491 {
23492 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23493 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23494 ? "msl" : "lsl");
23495 if (lane_count == 1)
23496 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23497 mnemonic, UINTVAL (info.u.mov.value));
23498 else if (info.u.mov.shift)
23499 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23500 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23501 element_char, UINTVAL (info.u.mov.value), shift_op,
23502 info.u.mov.shift);
23503 else
23504 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23505 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23506 element_char, UINTVAL (info.u.mov.value));
23507 }
23508 else
23509 {
23510 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23511 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23512 if (info.u.mov.shift)
23513 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23514 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23515 element_char, UINTVAL (info.u.mov.value), "lsl",
23516 info.u.mov.shift);
23517 else
23518 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23519 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23520 element_char, UINTVAL (info.u.mov.value));
23521 }
23522 return templ;
23523 }
23524
23525 char*
23526 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23527 {
23528
23529 /* If a floating point number was passed and we desire to use it in an
23530 integer mode, do the conversion to integer. */
23531 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23532 {
23533 unsigned HOST_WIDE_INT ival;
23534 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23535 gcc_unreachable ();
23536 immediate = gen_int_mode (ival, mode);
23537 }
23538
23539 machine_mode vmode;
23540 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
23541 a 128-bit vector mode. */
23542 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23543
23544 vmode = aarch64_simd_container_mode (mode, width);
23545 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23546 return aarch64_output_simd_mov_immediate (v_op, width);
23547 }
23548
23549 /* Return the output string to use for moving immediate CONST_VECTOR
23550 into an SVE register. */
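
/* Typical templates returned below (illustrative): "mov\t%0.s, #1" for a
   splatted integer, "index\t%0.s, #0, #1" for the linear series
   0, 1, 2, ..., and ptrue/pfalse forms for predicate constants. */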
23551
23552 char *
23553 aarch64_output_sve_mov_immediate (rtx const_vector)
23554 {
23555 static char templ[40];
23556 struct simd_immediate_info info;
23557 char element_char;
23558
23559 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23560 gcc_assert (is_valid);
23561
23562 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23563
23564 machine_mode vec_mode = GET_MODE (const_vector);
23565 if (aarch64_sve_pred_mode_p (vec_mode))
23566 {
23567 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23568 if (info.insn == simd_immediate_info::MOV)
23569 {
23570 gcc_assert (info.u.mov.value == const0_rtx);
23571 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23572 }
23573 else
23574 {
23575 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23576 unsigned int total_bytes;
23577 if (info.u.pattern == AARCH64_SV_ALL
23578 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23579 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23580 total_bytes / GET_MODE_SIZE (info.elt_mode));
23581 else
23582 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23583 svpattern_token (info.u.pattern));
23584 }
23585 return buf;
23586 }
23587
23588 if (info.insn == simd_immediate_info::INDEX)
23589 {
23590 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23591 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23592 element_char, INTVAL (info.u.index.base),
23593 INTVAL (info.u.index.step));
23594 return templ;
23595 }
23596
23597 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23598 {
23599 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23600 info.u.mov.value = GEN_INT (0);
23601 else
23602 {
23603 const int buf_size = 20;
23604 char float_buf[buf_size] = {};
23605 real_to_decimal_for_mode (float_buf,
23606 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23607 buf_size, buf_size, 1, info.elt_mode);
23608
23609 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23610 element_char, float_buf);
23611 return templ;
23612 }
23613 }
23614
23615 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23616 element_char, INTVAL (info.u.mov.value));
23617 return templ;
23618 }
23619
23620 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23621 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23622 pattern. */
23623
23624 char *
23625 aarch64_output_sve_ptrues (rtx const_unspec)
23626 {
23627 static char templ[40];
23628
23629 struct simd_immediate_info info;
23630 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23631 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23632
23633 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23634 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23635 svpattern_token (info.u.pattern));
23636 return templ;
23637 }
23638
23639 /* Split operands into moves from op[1] + op[2] into op[0]. */
23640
23641 void
23642 aarch64_split_combinev16qi (rtx operands[3])
23643 {
23644 unsigned int dest = REGNO (operands[0]);
23645 unsigned int src1 = REGNO (operands[1]);
23646 unsigned int src2 = REGNO (operands[2]);
23647 machine_mode halfmode = GET_MODE (operands[1]);
23648 unsigned int halfregs = REG_NREGS (operands[1]);
23649 rtx destlo, desthi;
23650
23651 gcc_assert (halfmode == V16QImode);
23652
23653 if (src1 == dest && src2 == dest + halfregs)
23654 {
23655 /* No-op move. Can't split to nothing; emit something. */
23656 emit_note (NOTE_INSN_DELETED);
23657 return;
23658 }
23659
23660 /* Preserve register attributes for variable tracking. */
23661 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23662 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23663 GET_MODE_SIZE (halfmode));
23664
23665 /* Special case of reversed high/low parts. */
23666 if (reg_overlap_mentioned_p (operands[2], destlo)
23667 && reg_overlap_mentioned_p (operands[1], desthi))
23668 {
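/* Swap the two source registers with the classic three-XOR trick,
   avoiding the need for a scratch register. */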
23669 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23670 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23671 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23672 }
23673 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23674 {
23675 /* Try to avoid unnecessary moves if part of the result
23676 is in the right place already. */
23677 if (src1 != dest)
23678 emit_move_insn (destlo, operands[1]);
23679 if (src2 != dest + halfregs)
23680 emit_move_insn (desthi, operands[2]);
23681 }
23682 else
23683 {
23684 if (src2 != dest + halfregs)
23685 emit_move_insn (desthi, operands[2]);
23686 if (src1 != dest)
23687 emit_move_insn (destlo, operands[1]);
23688 }
23689 }
23690
23691 /* vec_perm support. */
23692
23693 struct expand_vec_perm_d
23694 {
23695 rtx target, op0, op1;
23696 vec_perm_indices perm;
23697 machine_mode vmode;
23698 machine_mode op_mode;
23699 unsigned int vec_flags;
23700 unsigned int op_vec_flags;
23701 bool one_vector_p;
23702 bool testing_p;
23703 };
23704
23705 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23706
23707 /* Generate a variable permutation. */
23708
23709 static void
23710 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23711 {
23712 machine_mode vmode = GET_MODE (target);
23713 bool one_vector_p = rtx_equal_p (op0, op1);
23714
23715 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23716 gcc_checking_assert (GET_MODE (op0) == vmode);
23717 gcc_checking_assert (GET_MODE (op1) == vmode);
23718 gcc_checking_assert (GET_MODE (sel) == vmode);
23719 gcc_checking_assert (TARGET_SIMD);
23720
23721 if (one_vector_p)
23722 {
23723 if (vmode == V8QImode)
23724 {
23725 /* Expand the argument to a V16QI mode by duplicating it. */
23726 rtx pair = gen_reg_rtx (V16QImode);
23727 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23728 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23729 }
23730 else
23731 {
23732 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23733 }
23734 }
23735 else
23736 {
23737 rtx pair;
23738
23739 if (vmode == V8QImode)
23740 {
23741 pair = gen_reg_rtx (V16QImode);
23742 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23743 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23744 }
23745 else
23746 {
23747 pair = gen_reg_rtx (V2x16QImode);
23748 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23749 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23750 }
23751 }
23752 }
23753
23754 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23755 NELT is the number of elements in the vector. */
23756
23757 void
23758 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23759 unsigned int nelt)
23760 {
23761 machine_mode vmode = GET_MODE (target);
23762 bool one_vector_p = rtx_equal_p (op0, op1);
23763 rtx mask;
23764
23765 /* The TBL instruction does not use a modulo index, so we must take care
23766 of that ourselves. */
23767 mask = aarch64_simd_gen_const_vector_dup (vmode,
23768 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23769 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23770
23771 /* For big-endian, we also need to reverse the index within the vector
23772 (but not which vector). */
23773 if (BYTES_BIG_ENDIAN)
23774 {
23775 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23776 if (!one_vector_p)
23777 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23778 sel = expand_simple_binop (vmode, XOR, sel, mask,
23779 NULL, 0, OPTAB_LIB_WIDEN);
23780 }
23781 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
23782 }
23783
23784 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23785
23786 static void
23787 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23788 {
23789 emit_insn (gen_rtx_SET (target,
23790 gen_rtx_UNSPEC (GET_MODE (target),
23791 gen_rtvec (2, op0, op1), code)));
23792 }
23793
23794 /* Expand an SVE vec_perm with the given operands. */
23795
23796 void
23797 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23798 {
23799 machine_mode data_mode = GET_MODE (target);
23800 machine_mode sel_mode = GET_MODE (sel);
23801 /* Enforced by the pattern condition. */
23802 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23803
23804 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23805 size of the two value vectors, i.e. the upper bits of the indices
23806 are effectively ignored. SVE TBL instead produces 0 for any
23807 out-of-range indices, so we need to modulo all the vec_perm indices
23808 to ensure they are all in range. */
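/* E.g. (hypothetical) with 4-element vectors and SEL = {1, 5, 2, 7}:
   TBL on OP0 gives {op0[1], 0, op0[2], 0}, TBL on OP1 with SEL minus
   the element count gives {0, op1[1], 0, op1[3]}, and ORing the two
   results produces the requested permutation. */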
23809 rtx sel_reg = force_reg (sel_mode, sel);
23810
23811 /* Check if the sel only references the first values vector. */
23812 if (CONST_VECTOR_P (sel)
23813 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23814 {
23815 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23816 return;
23817 }
23818
23819 /* Check if the two values vectors are the same. */
23820 if (rtx_equal_p (op0, op1))
23821 {
23822 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23823 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23824 NULL, 0, OPTAB_DIRECT);
23825 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23826 return;
23827 }
23828
23829 /* Run TBL for each value vector and combine the results. */
23830
23831 rtx res0 = gen_reg_rtx (data_mode);
23832 rtx res1 = gen_reg_rtx (data_mode);
23833 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23834 if (!CONST_VECTOR_P (sel)
23835 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23836 {
23837 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23838 2 * nunits - 1);
23839 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23840 NULL, 0, OPTAB_DIRECT);
23841 }
23842 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23843 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23844 NULL, 0, OPTAB_DIRECT);
23845 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23846 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23847 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23848 else
23849 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
23850 }
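/* Illustrative sketch only (not part of GCC): a standalone byte-level model
   of the two-TBL-plus-OR sequence emitted above for a two-input SVE permute.
   The element values and vector length are arbitrary example choices.  */
#if 0
#include <cassert>
#include <cstdint>
#include <vector>

/* SVE TBL semantics: out-of-range indices select zero.  */
static std::vector<uint8_t>
sve_tbl (const std::vector<uint8_t> &op, const std::vector<uint8_t> &sel)
{
  std::vector<uint8_t> res (sel.size ());
  for (size_t i = 0; i < sel.size (); ++i)
    res[i] = sel[i] < op.size () ? op[sel[i]] : 0;
  return res;
}

int
main ()
{
  const unsigned nunits = 4;
  std::vector<uint8_t> op0 = { 10, 11, 12, 13 };
  std::vector<uint8_t> op1 = { 20, 21, 22, 23 };
  /* Indices 0..3 select from op0, 4..7 select from op1.  */
  std::vector<uint8_t> sel = { 6, 1, 4, 3 };

  /* Subtracting NUNITS wraps indices that referred to op0 out of range.  */
  std::vector<uint8_t> sel_sub (nunits);
  for (unsigned i = 0; i < nunits; ++i)
    sel_sub[i] = (uint8_t) (sel[i] - nunits);

  std::vector<uint8_t> res0 = sve_tbl (op0, sel);
  std::vector<uint8_t> res1 = sve_tbl (op1, sel_sub);

  /* For each lane at least one of res0/res1 is zero, so IOR recovers the
     selected value.  */
  std::vector<uint8_t> res (nunits);
  for (unsigned i = 0; i < nunits; ++i)
    res[i] = res0[i] | res1[i];

  std::vector<uint8_t> expected = { 22, 11, 20, 13 };
  assert (res == expected);
  return 0;
}
#endif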
23851
23852 /* Recognize patterns suitable for the TRN instructions. */
23853 static bool
23854 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23855 {
23856 HOST_WIDE_INT odd;
23857 poly_uint64 nelt = d->perm.length ();
23858 rtx out, in0, in1;
23859 machine_mode vmode = d->vmode;
23860
23861 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23862 return false;
23863
23864 /* Note that these are little-endian tests.
23865 We correct for big-endian later. */
23866 if (!d->perm[0].is_constant (&odd)
23867 || (odd != 0 && odd != 1)
23868 || !d->perm.series_p (0, 2, odd, 2)
23869 || !d->perm.series_p (1, 2, nelt + odd, 2))
23870 return false;
23871
23872 /* Success! */
23873 if (d->testing_p)
23874 return true;
23875
23876 in0 = d->op0;
23877 in1 = d->op1;
23878 /* We don't need a big-endian lane correction for SVE; see the comment
23879 at the head of aarch64-sve.md for details. */
23880 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23881 {
23882 std::swap (in0, in1);
23883 odd = !odd;
23884 }
23885 out = d->target;
23886
23887 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23888 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23889 return true;
23890 }
23891
23892 /* Try to re-encode the PERM constant so it combines odd and even elements.
23893 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23894 We retry with this new constant using the full suite of patterns. */
23895 static bool
23896 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23897 {
23898 expand_vec_perm_d newd;
23899 unsigned HOST_WIDE_INT nelt;
23900
23901 if (d->vec_flags != VEC_ADVSIMD)
23902 return false;
23903
23904 /* Get the new mode. Always twice the size of the inner mode
23905 and half the number of elements. */
23906 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23907 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23908 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23909 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23910
23911 if (new_mode == word_mode)
23912 return false;
23913
23914 /* to_constant is safe since this routine is specific to Advanced SIMD
23915 vectors. */
23916 nelt = d->perm.length ().to_constant ();
23917
23918 vec_perm_builder newpermconst;
23919 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23920
23921 /* Convert the perm constant if we can. Require pairs (even, even + 1). */
23922 for (unsigned int i = 0; i < nelt; i += 2)
23923 {
23924 poly_int64 elt0 = d->perm[i];
23925 poly_int64 elt1 = d->perm[i + 1];
23926 poly_int64 newelt;
23927 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23928 return false;
23929 newpermconst.quick_push (newelt.to_constant ());
23930 }
23931 newpermconst.finalize ();
23932
23933 newd.vmode = new_mode;
23934 newd.vec_flags = VEC_ADVSIMD;
23935 newd.op_mode = newd.vmode;
23936 newd.op_vec_flags = newd.vec_flags;
23937 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23938 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23939 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23940 newd.testing_p = d->testing_p;
23941 newd.one_vector_p = d->one_vector_p;
23942
23943 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23944 return aarch64_expand_vec_perm_const_1 (&newd);
23945 }
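/* Illustrative sketch only (not part of GCC): the index re-encoding
   performed by aarch64_evpc_reencode above, where each (even, even + 1)
   pair of narrow-element indices becomes one index into a vector of
   double-width elements, e.g. {0, 1, 4, 5} -> {0, 2}.  */
#if 0
#include <cassert>
#include <optional>
#include <vector>

static std::optional<std::vector<long>>
reencode_perm (const std::vector<long> &perm)
{
  std::vector<long> wide;
  for (size_t i = 0; i + 1 < perm.size (); i += 2)
    {
      /* Require the pair to be (even, even + 1); anything else cannot be
         expressed as a permute of double-width elements.  */
      if (perm[i] % 2 != 0 || perm[i + 1] != perm[i] + 1)
        return std::nullopt;
      wide.push_back (perm[i] / 2);
    }
  return wide;
}

int
main ()
{
  assert (reencode_perm ({ 0, 1, 4, 5 }) == std::vector<long> ({ 0, 2 }));
  assert (!reencode_perm ({ 0, 2, 4, 6 }).has_value ());
  return 0;
}
#endif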
23946
23947 /* Recognize patterns suitable for the UZP instructions. */
23948 static bool
23949 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23950 {
23951 HOST_WIDE_INT odd;
23952 rtx out, in0, in1;
23953 machine_mode vmode = d->vmode;
23954
23955 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23956 return false;
23957
23958 /* Note that these are little-endian tests.
23959 We correct for big-endian later. */
23960 if (!d->perm[0].is_constant (&odd)
23961 || (odd != 0 && odd != 1)
23962 || !d->perm.series_p (0, 1, odd, 2))
23963 return false;
23964
23965 /* Success! */
23966 if (d->testing_p)
23967 return true;
23968
23969 in0 = d->op0;
23970 in1 = d->op1;
23971 /* We don't need a big-endian lane correction for SVE; see the comment
23972 at the head of aarch64-sve.md for details. */
23973 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23974 {
23975 std::swap (in0, in1);
23976 odd = !odd;
23977 }
23978 out = d->target;
23979
23980 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23981 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23982 return true;
23983 }
23984
23985 /* Recognize patterns suitable for the ZIP instructions. */
23986 static bool
23987 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23988 {
23989 unsigned int high;
23990 poly_uint64 nelt = d->perm.length ();
23991 rtx out, in0, in1;
23992 machine_mode vmode = d->vmode;
23993
23994 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23995 return false;
23996
23997 /* Note that these are little-endian tests.
23998 We correct for big-endian later. */
23999 poly_uint64 first = d->perm[0];
24000 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
24001 || !d->perm.series_p (0, 2, first, 1)
24002 || !d->perm.series_p (1, 2, first + nelt, 1))
24003 return false;
24004 high = maybe_ne (first, 0U);
24005
24006 /* Success! */
24007 if (d->testing_p)
24008 return true;
24009
24010 in0 = d->op0;
24011 in1 = d->op1;
24012 /* We don't need a big-endian lane correction for SVE; see the comment
24013 at the head of aarch64-sve.md for details. */
24014 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
24015 {
24016 std::swap (in0, in1);
24017 high = !high;
24018 }
24019 out = d->target;
24020
24021 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
24022 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
24023 return true;
24024 }
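/* Illustrative sketch only (not part of GCC): the index patterns that the
   TRN, UZP and ZIP recognizers above accept, spelt out for a 4-element
   vector.  Indices 0..3 select from op0 and 4..7 select from op1.  */
#if 0
#include <cassert>
#include <vector>

int
main ()
{
  const unsigned nelt = 4;
  std::vector<unsigned> trn1 = { 0, 4, 2, 6 };	/* odd == 0.  */
  std::vector<unsigned> trn2 = { 1, 5, 3, 7 };	/* odd == 1.  */
  std::vector<unsigned> uzp1 = { 0, 2, 4, 6 };	/* odd == 0.  */
  std::vector<unsigned> uzp2 = { 1, 3, 5, 7 };	/* odd == 1.  */
  std::vector<unsigned> zip1 = { 0, 4, 1, 5 };	/* "high" == 0.  */
  std::vector<unsigned> zip2 = { 2, 6, 3, 7 };	/* "high" == 1.  */

  /* TRN: even lanes step by 2 from ODD, odd lanes step by 2 from
     NELT + ODD.  */
  for (unsigned i = 0; i < nelt; i += 2)
    assert (trn1[i] == i && trn1[i + 1] == nelt + i
            && trn2[i] == 1 + i && trn2[i + 1] == nelt + 1 + i);

  /* UZP: a single series starting at ODD and stepping by 2.  */
  for (unsigned i = 0; i < nelt; ++i)
    assert (uzp1[i] == 2 * i && uzp2[i] == 2 * i + 1);

  /* ZIP: interleave the low halves (ZIP1) or the high halves (ZIP2).  */
  for (unsigned i = 0; i < nelt / 2; ++i)
    assert (zip1[2 * i] == i && zip1[2 * i + 1] == nelt + i
            && zip2[2 * i] == nelt / 2 + i
            && zip2[2 * i + 1] == nelt + nelt / 2 + i);
  return 0;
}
#endif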
24025
24026 /* Recognize patterns for the EXT insn. */
24027
24028 static bool
24029 aarch64_evpc_ext (struct expand_vec_perm_d *d)
24030 {
24031 HOST_WIDE_INT location;
24032 rtx offset;
24033
24034 /* The first element always refers to the first vector.
24035 Check if the extracted indices are increasing by one. */
24036 if (d->vec_flags == VEC_SVE_PRED
24037 || !d->perm[0].is_constant (&location)
24038 || !d->perm.series_p (0, 1, location, 1))
24039 return false;
24040
24041 /* Success! */
24042 if (d->testing_p)
24043 return true;
24044
24045 /* The case where (location == 0) is a no-op for both big- and little-endian,
24046 and is removed by the mid-end at optimization levels -O1 and higher.
24047
24048 We don't need a big-endian lane correction for SVE; see the comment
24049 at the head of aarch64-sve.md for details. */
24050 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
24051 {
24052 /* After setup, we want the high elements of the first vector (stored
24053 at the LSB end of the register), and the low elements of the second
24054 vector (stored at the MSB end of the register). So swap. */
24055 std::swap (d->op0, d->op1);
24056 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
24057 to_constant () is safe since this is restricted to Advanced SIMD
24058 vectors. */
24059 location = d->perm.length ().to_constant () - location;
24060 }
24061
24062 offset = GEN_INT (location);
24063 emit_set_insn (d->target,
24064 gen_rtx_UNSPEC (d->vmode,
24065 gen_rtvec (3, d->op0, d->op1, offset),
24066 UNSPEC_EXT));
24067 return true;
24068 }
24069
24070 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
24071 within each 64-bit, 32-bit or 16-bit granule. */
24072
24073 static bool
24074 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
24075 {
24076 HOST_WIDE_INT diff;
24077 unsigned int i, size, unspec;
24078 machine_mode pred_mode;
24079
24080 if (d->vec_flags == VEC_SVE_PRED
24081 || !d->one_vector_p
24082 || !d->perm[0].is_constant (&diff)
24083 || !diff)
24084 return false;
24085
24086 if (d->vec_flags & VEC_SVE_DATA)
24087 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
24088 else
24089 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
24090 if (size == 64)
24091 {
24092 unspec = UNSPEC_REV64;
24093 pred_mode = VNx2BImode;
24094 }
24095 else if (size == 32)
24096 {
24097 unspec = UNSPEC_REV32;
24098 pred_mode = VNx4BImode;
24099 }
24100 else if (size == 16)
24101 {
24102 unspec = UNSPEC_REV16;
24103 pred_mode = VNx8BImode;
24104 }
24105 else
24106 return false;
24107
24108 unsigned int step = diff + 1;
24109 for (i = 0; i < step; ++i)
24110 if (!d->perm.series_p (i, step, diff - i, step))
24111 return false;
24112
24113 /* Success! */
24114 if (d->testing_p)
24115 return true;
24116
24117 if (d->vec_flags & VEC_SVE_DATA)
24118 {
24119 rtx pred = aarch64_ptrue_reg (pred_mode);
24120 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
24121 d->target, pred, d->op0));
24122 return true;
24123 }
24124 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
24125 emit_set_insn (d->target, src);
24126 return true;
24127 }
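/* Illustrative sketch only (not part of GCC): the permutation shape
   accepted by aarch64_evpc_rev_local above.  Taking 8-bit elements and
   DIFF == 3 (i.e. REV32), the elements inside every 4-element granule are
   reversed, which is exactly the series check used in the function.  */
#if 0
#include <cassert>
#include <vector>

int
main ()
{
  const unsigned nelt = 16, diff = 3;
  const unsigned step = diff + 1;

  /* Build the REV32-on-bytes permutation explicitly.  */
  std::vector<unsigned> perm (nelt);
  for (unsigned g = 0; g < nelt; g += step)
    for (unsigned i = 0; i < step; ++i)
      perm[g + i] = g + (diff - i);

  /* Lane I, I + STEP, I + 2 * STEP, ... starts at DIFF - I and advances
     by STEP, matching d->perm.series_p (i, step, diff - i, step).  */
  for (unsigned i = 0; i < step; ++i)
    for (unsigned j = i; j < nelt; j += step)
      assert (perm[j] == (diff - i) + (j - i));
  return 0;
}
#endif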
24128
24129 /* Recognize patterns for the REV insn, which reverses elements within
24130 a full vector. */
24131
24132 static bool
24133 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
24134 {
24135 poly_uint64 nelt = d->perm.length ();
24136
24137 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
24138 return false;
24139
24140 if (!d->perm.series_p (0, 1, nelt - 1, -1))
24141 return false;
24142
24143 /* Success! */
24144 if (d->testing_p)
24145 return true;
24146
24147 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
24148 emit_set_insn (d->target, src);
24149 return true;
24150 }
24151
24152 static bool
24153 aarch64_evpc_dup (struct expand_vec_perm_d *d)
24154 {
24155 rtx out = d->target;
24156 rtx in0;
24157 HOST_WIDE_INT elt;
24158 machine_mode vmode = d->vmode;
24159 rtx lane;
24160
24161 if (d->vec_flags == VEC_SVE_PRED
24162 || d->perm.encoding ().encoded_nelts () != 1
24163 || !d->perm[0].is_constant (&elt))
24164 return false;
24165
24166 if ((d->vec_flags & VEC_SVE_DATA)
24167 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
24168 return false;
24169
24170 /* Success! */
24171 if (d->testing_p)
24172 return true;
24173
24174 /* The generic preparation in aarch64_expand_vec_perm_const_1
24175 swaps the operand order and the permute indices if it finds
24176 d->perm[0] to be in the second operand. Thus, we can always
24177 use d->op0 and need not do any extra arithmetic to get the
24178 correct lane number. */
24179 in0 = d->op0;
24180 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
24181
24182 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24183 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24184 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
24185 return true;
24186 }
24187
24188 static bool
24189 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24190 {
24191 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
24192 machine_mode vmode = d->vmode;
24193
24194 /* Make sure that the indices are constant. */
24195 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24196 for (unsigned int i = 0; i < encoded_nelts; ++i)
24197 if (!d->perm[i].is_constant ())
24198 return false;
24199
24200 if (d->testing_p)
24201 return true;
24202
24203 /* Generic code will try constant permutation twice: once with the
24204 original mode and again with the elements lowered to QImode.
24205 So wait and don't do the selector expansion ourselves. */
24206 if (vmode != V8QImode && vmode != V16QImode)
24207 return false;
24208
24209 /* to_constant is safe since this routine is specific to Advanced SIMD
24210 vectors. */
24211 unsigned int nelt = d->perm.length ().to_constant ();
24212 for (unsigned int i = 0; i < nelt; ++i)
24213 /* If big-endian and using two vectors, we end up with a weird mixed-endian
24214 mode on NEON. Reverse the index within each word but not the word
24215 itself. to_constant is safe because we checked is_constant above. */
24216 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24217 ? d->perm[i].to_constant () ^ (nelt - 1)
24218 : d->perm[i].to_constant ());
24219
24220 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24221 sel = force_reg (vmode, sel);
24222
24223 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24224 return true;
24225 }
24226
24227 /* Try to implement D using an SVE TBL instruction. */
24228
24229 static bool
24230 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24231 {
24232 unsigned HOST_WIDE_INT nelt;
24233
24234 /* Permuting two variable-length vectors could overflow the
24235 index range. */
24236 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24237 return false;
24238
24239 if (d->testing_p)
24240 return true;
24241
24242 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
24243 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
24244 if (d->one_vector_p)
24245 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24246 else
24247 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
24248 return true;
24249 }
24250
24251 /* Try to implement D using SVE dup instruction. */
24252
24253 static bool
24254 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
24255 {
24256 if (BYTES_BIG_ENDIAN
24257 || !d->one_vector_p
24258 || d->vec_flags != VEC_SVE_DATA
24259 || d->op_vec_flags != VEC_ADVSIMD
24260 || d->perm.encoding ().nelts_per_pattern () != 1
24261 || !known_eq (d->perm.encoding ().npatterns (),
24262 GET_MODE_NUNITS (d->op_mode))
24263 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
24264 return false;
24265
24266 int npatterns = d->perm.encoding ().npatterns ();
24267 for (int i = 0; i < npatterns; i++)
24268 if (!known_eq (d->perm[i], i))
24269 return false;
24270
24271 if (d->testing_p)
24272 return true;
24273
24274 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
24275 return true;
24276 }
24277
24278 /* Try to implement D using SVE SEL instruction. */
24279
24280 static bool
24281 aarch64_evpc_sel (struct expand_vec_perm_d *d)
24282 {
24283 machine_mode vmode = d->vmode;
24284 int unit_size = GET_MODE_UNIT_SIZE (vmode);
24285
24286 if (d->vec_flags != VEC_SVE_DATA
24287 || unit_size > 8)
24288 return false;
24289
24290 int n_patterns = d->perm.encoding ().npatterns ();
24291 poly_int64 vec_len = d->perm.length ();
24292
24293 for (int i = 0; i < n_patterns; ++i)
24294 if (!known_eq (d->perm[i], i)
24295 && !known_eq (d->perm[i], vec_len + i))
24296 return false;
24297
24298 for (int i = n_patterns; i < n_patterns * 2; i++)
24299 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24300 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24301 return false;
24302
24303 if (d->testing_p)
24304 return true;
24305
24306 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
24307
24308 /* Build a predicate that is true when op0 elements should be used. */
24309 rtx_vector_builder builder (pred_mode, n_patterns, 2);
24310 for (int i = 0; i < n_patterns * 2; i++)
24311 {
24312 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24313 : CONST0_RTX (BImode);
24314 builder.quick_push (elem);
24315 }
24316
24317 rtx const_vec = builder.build ();
24318 rtx pred = force_reg (pred_mode, const_vec);
24319 /* TARGET = PRED ? OP0 : OP1. */
24320 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
24321 return true;
24322 }
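/* Illustrative sketch only (not part of GCC): the SEL-style permute handled
   above.  When every index selects lane I from either op0 (index I) or op1
   (index LEN + I), the permute degenerates to a lane-wise select under a
   constant predicate.  Values below are arbitrary example data.  */
#if 0
#include <cassert>
#include <vector>

int
main ()
{
  const int len = 4;
  std::vector<int> perm = { 0, 5, 2, 7 };	/* op0, op1, op0, op1.  */
  std::vector<int> op0 = { 10, 11, 12, 13 };
  std::vector<int> op1 = { 20, 21, 22, 23 };

  std::vector<int> target (len);
  for (int i = 0; i < len; i++)
    {
      assert (perm[i] == i || perm[i] == len + i);
      bool use_op0 = perm[i] == i;		/* The predicate bit.  */
      target[i] = use_op0 ? op0[i] : op1[i];
    }

  std::vector<int> expected = { 10, 21, 12, 23 };
  assert (target == expected);
  return 0;
}
#endif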
24323
24324 /* Recognize patterns suitable for the INS instructions. */
24325 static bool
24326 aarch64_evpc_ins (struct expand_vec_perm_d *d)
24327 {
24328 machine_mode mode = d->vmode;
24329 unsigned HOST_WIDE_INT nelt;
24330
24331 if (d->vec_flags != VEC_ADVSIMD)
24332 return false;
24333
24334 /* to_constant is safe since this routine is specific to Advanced SIMD
24335 vectors. */
24336 nelt = d->perm.length ().to_constant ();
24337 rtx insv = d->op0;
24338
24339 HOST_WIDE_INT idx = -1;
24340
24341 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24342 {
24343 HOST_WIDE_INT elt;
24344 if (!d->perm[i].is_constant (&elt))
24345 return false;
24346 if (elt == (HOST_WIDE_INT) i)
24347 continue;
24348 if (idx != -1)
24349 {
24350 idx = -1;
24351 break;
24352 }
24353 idx = i;
24354 }
24355
24356 if (idx == -1)
24357 {
24358 insv = d->op1;
24359 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24360 {
24361 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24362 continue;
24363 if (idx != -1)
24364 return false;
24365 idx = i;
24366 }
24367
24368 if (idx == -1)
24369 return false;
24370 }
24371
24372 if (d->testing_p)
24373 return true;
24374
24375 gcc_assert (idx != -1);
24376
24377 unsigned extractindex = d->perm[idx].to_constant ();
24378 rtx extractv = d->op0;
24379 if (extractindex >= nelt)
24380 {
24381 extractv = d->op1;
24382 extractindex -= nelt;
24383 }
24384 gcc_assert (extractindex < nelt);
24385
24386 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24387 expand_operand ops[5];
24388 create_output_operand (&ops[0], d->target, mode);
24389 create_input_operand (&ops[1], insv, mode);
24390 create_integer_operand (&ops[2], 1 << idx);
24391 create_input_operand (&ops[3], extractv, mode);
24392 create_integer_operand (&ops[4], extractindex);
24393 expand_insn (icode, 5, ops);
24394
24395 return true;
24396 }
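/* Illustrative sketch only (not part of GCC): the shape test used by
   aarch64_evpc_ins above, reduced to the op0-based case.  The permutation
   must be the identity on one input except for a single lane, which may be
   filled from any lane of either input.  The real function also retries
   with op1 as the base when this test fails.  */
#if 0
#include <cassert>
#include <vector>

/* Return the index of the single non-identity lane, or -1 if more than one
   lane differs from the identity on op0.  */
static int
single_modified_lane (const std::vector<int> &perm)
{
  int idx = -1;
  for (int i = 0; i < (int) perm.size (); ++i)
    if (perm[i] != i)
      {
        if (idx != -1)
          return -1;
        idx = i;
      }
  return idx;
}

int
main ()
{
  /* V4SI-style example: lanes 0..3 are op0, lanes 4..7 are op1.
     {0, 5, 2, 3} is op0 with lane 1 replaced by lane 1 of op1 -> INS.  */
  assert (single_modified_lane ({ 0, 5, 2, 3 }) == 1);
  /* Two lanes differ, so a single INS cannot implement it.  */
  assert (single_modified_lane ({ 0, 5, 6, 3 }) == -1);
  return 0;
}
#endif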
24397
24398 static bool
24399 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24400 {
24401 gcc_assert (d->op_mode != E_VOIDmode);
24402
24403 /* The pattern matching functions above are written to look for a small
24404 number to begin the sequence (0, 1, N/2). If we begin with an index
24405 from the second operand, we can swap the operands. */
24406 poly_int64 nelt = d->perm.length ();
24407 if (known_ge (d->perm[0], nelt))
24408 {
24409 d->perm.rotate_inputs (1);
24410 std::swap (d->op0, d->op1);
24411 }
24412
24413 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
24414 || d->vec_flags == VEC_SVE_DATA
24415 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24416 || d->vec_flags == VEC_SVE_PRED)
24417 && known_gt (nelt, 1))
24418 {
24419 if (d->vmode == d->op_mode)
24420 {
24421 if (aarch64_evpc_rev_local (d))
24422 return true;
24423 else if (aarch64_evpc_rev_global (d))
24424 return true;
24425 else if (aarch64_evpc_ext (d))
24426 return true;
24427 else if (aarch64_evpc_dup (d))
24428 return true;
24429 else if (aarch64_evpc_zip (d))
24430 return true;
24431 else if (aarch64_evpc_uzp (d))
24432 return true;
24433 else if (aarch64_evpc_trn (d))
24434 return true;
24435 else if (aarch64_evpc_sel (d))
24436 return true;
24437 else if (aarch64_evpc_ins (d))
24438 return true;
24439 else if (aarch64_evpc_reencode (d))
24440 return true;
24441
24442 if (d->vec_flags == VEC_SVE_DATA)
24443 return aarch64_evpc_sve_tbl (d);
24444 else if (d->vec_flags == VEC_ADVSIMD)
24445 return aarch64_evpc_tbl (d);
24446 }
24447 else
24448 {
24449 if (aarch64_evpc_sve_dup (d))
24450 return true;
24451 }
24452 }
24453 return false;
24454 }
24455
24456 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24457
24458 static bool
24459 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24460 rtx target, rtx op0, rtx op1,
24461 const vec_perm_indices &sel)
24462 {
24463 struct expand_vec_perm_d d;
24464
24465 /* Check whether the mask can be applied to a single vector. */
24466 if (sel.ninputs () == 1
24467 || (op0 && rtx_equal_p (op0, op1)))
24468 d.one_vector_p = true;
24469 else if (sel.all_from_input_p (0))
24470 {
24471 d.one_vector_p = true;
24472 op1 = op0;
24473 }
24474 else if (sel.all_from_input_p (1))
24475 {
24476 d.one_vector_p = true;
24477 op0 = op1;
24478 }
24479 else
24480 d.one_vector_p = false;
24481
24482 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24483 sel.nelts_per_input ());
24484 d.vmode = vmode;
24485 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24486 d.op_mode = op_mode;
24487 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24488 d.target = target;
24489 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
24490 if (op0 == op1)
24491 d.op1 = d.op0;
24492 else
24493 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
24494 d.testing_p = !target;
24495
24496 if (!d.testing_p)
24497 return aarch64_expand_vec_perm_const_1 (&d);
24498
24499 rtx_insn *last = get_last_insn ();
24500 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24501 gcc_assert (last == get_last_insn ());
24502
24503 return ret;
24504 }
24505
24506 /* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
24507
24508 bool
24509 aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
24510 tree vectype, wide_int cst,
24511 rtx *output, rtx in0, rtx in1)
24512 {
24513 if (code != TRUNC_DIV_EXPR
24514 || !TYPE_UNSIGNED (vectype))
24515 return false;
24516
24517 machine_mode mode = TYPE_MODE (vectype);
24518 unsigned int flags = aarch64_classify_vector_mode (mode);
24519 if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
24520 return false;
24521
24522 int pow = wi::exact_log2 (cst + 1);
24523 auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
24524 /* SVE actually has a div operator, we may have gotten here through
24525 that route. */
24526 if (pow != (int) (element_precision (vectype) / 2)
24527 || insn_code == CODE_FOR_nothing)
24528 return false;
24529
24530 /* We can use the optimized pattern. */
24531 if (in0 == NULL_RTX && in1 == NULL_RTX)
24532 return true;
24533
24534 gcc_assert (output);
24535
24536 expand_operand ops[3];
24537 create_output_operand (&ops[0], *output, mode);
24538 create_input_operand (&ops[1], in0, mode);
24539 create_fixed_operand (&ops[2], in1);
24540 expand_insn (insn_code, 3, ops);
24541 *output = ops[0].value;
24542 return true;
24543 }
24544
24545 /* Generate a byte permute mask for a register of mode MODE,
24546 which has NUNITS units. */
24547
24548 rtx
24549 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24550 {
24551 /* We have to reverse each vector because we don't have
24552 a permuted load that can reverse-load according to ABI rules. */
24553 rtx mask;
24554 rtvec v = rtvec_alloc (16);
24555 unsigned int i, j;
24556 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24557
24558 gcc_assert (BYTES_BIG_ENDIAN);
24559 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24560
24561 for (i = 0; i < nunits; i++)
24562 for (j = 0; j < usize; j++)
24563 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24564 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24565 return force_reg (V16QImode, mask);
24566 }
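/* Illustrative sketch only (not part of GCC): the byte permute mask built
   by aarch64_reverse_mask above, shown for 8 units of 2 bytes each.  Byte J
   of unit I is taken from byte (I + 1) * USIZE - 1 - J, i.e. the bytes of
   each unit are reversed in place.  */
#if 0
#include <cassert>

int
main ()
{
  const unsigned nunits = 8, usize = 2;
  unsigned char mask[16];
  for (unsigned i = 0; i < nunits; i++)
    for (unsigned j = 0; j < usize; j++)
      mask[i * usize + j] = (i + 1) * usize - 1 - j;

  /* First unit selects bytes {1, 0}, second {3, 2}, ..., last {15, 14}.  */
  assert (mask[0] == 1 && mask[1] == 0);
  assert (mask[2] == 3 && mask[3] == 2);
  assert (mask[14] == 15 && mask[15] == 14);
  return 0;
}
#endif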
24567
24568 /* Expand an SVE integer comparison using the SVE equivalent of:
24569
24570 (set TARGET (CODE OP0 OP1)). */
24571
24572 void
24573 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24574 {
24575 machine_mode pred_mode = GET_MODE (target);
24576 machine_mode data_mode = GET_MODE (op0);
24577 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24578 op0, op1);
24579 if (!rtx_equal_p (target, res))
24580 emit_move_insn (target, res);
24581 }
24582
24583 /* Return the UNSPEC_COND_* code for comparison CODE. */
24584
24585 static unsigned int
24586 aarch64_unspec_cond_code (rtx_code code)
24587 {
24588 switch (code)
24589 {
24590 case NE:
24591 return UNSPEC_COND_FCMNE;
24592 case EQ:
24593 return UNSPEC_COND_FCMEQ;
24594 case LT:
24595 return UNSPEC_COND_FCMLT;
24596 case GT:
24597 return UNSPEC_COND_FCMGT;
24598 case LE:
24599 return UNSPEC_COND_FCMLE;
24600 case GE:
24601 return UNSPEC_COND_FCMGE;
24602 case UNORDERED:
24603 return UNSPEC_COND_FCMUO;
24604 default:
24605 gcc_unreachable ();
24606 }
24607 }
24608
24609 /* Emit:
24610
24611 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24612
24613 where <X> is the operation associated with comparison CODE.
24614 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24615
24616 static void
24617 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24618 bool known_ptrue_p, rtx op0, rtx op1)
24619 {
24620 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24621 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24622 gen_rtvec (4, pred, flag, op0, op1),
24623 aarch64_unspec_cond_code (code));
24624 emit_set_insn (target, unspec);
24625 }
24626
24627 /* Emit the SVE equivalent of:
24628
24629 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24630 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24631 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24632
24633 where <Xi> is the operation associated with comparison CODEi.
24634 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24635
24636 static void
24637 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24638 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24639 {
24640 machine_mode pred_mode = GET_MODE (pred);
24641 rtx tmp1 = gen_reg_rtx (pred_mode);
24642 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24643 rtx tmp2 = gen_reg_rtx (pred_mode);
24644 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24645 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24646 }
24647
24648 /* Emit the SVE equivalent of:
24649
24650 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24651 (set TARGET (not TMP))
24652
24653 where <X> is the operation associated with comparison CODE.
24654 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24655
24656 static void
24657 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24658 bool known_ptrue_p, rtx op0, rtx op1)
24659 {
24660 machine_mode pred_mode = GET_MODE (pred);
24661 rtx tmp = gen_reg_rtx (pred_mode);
24662 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24663 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24664 }
24665
24666 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24667
24668 (set TARGET (CODE OP0 OP1))
24669
24670 If CAN_INVERT_P is true, the caller can also handle inverted results;
24671 return true if the result is in fact inverted. */
24672
24673 bool
24674 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24675 rtx op0, rtx op1, bool can_invert_p)
24676 {
24677 machine_mode pred_mode = GET_MODE (target);
24678 machine_mode data_mode = GET_MODE (op0);
24679
24680 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24681 switch (code)
24682 {
24683 case UNORDERED:
24684 /* UNORDERED has no immediate form. */
24685 op1 = force_reg (data_mode, op1);
24686 /* fall through */
24687 case LT:
24688 case LE:
24689 case GT:
24690 case GE:
24691 case EQ:
24692 case NE:
24693 {
24694 /* There is native support for the comparison. */
24695 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24696 return false;
24697 }
24698
24699 case LTGT:
24700 /* This is a trapping operation (LT or GT). */
24701 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24702 return false;
24703
24704 case UNEQ:
24705 if (!flag_trapping_math)
24706 {
24707 /* This would trap for signaling NaNs. */
24708 op1 = force_reg (data_mode, op1);
24709 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24710 ptrue, true, op0, op1);
24711 return false;
24712 }
24713 /* fall through */
24714 case UNLT:
24715 case UNLE:
24716 case UNGT:
24717 case UNGE:
24718 if (flag_trapping_math)
24719 {
24720 /* Work out which elements are ordered. */
24721 rtx ordered = gen_reg_rtx (pred_mode);
24722 op1 = force_reg (data_mode, op1);
24723 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24724 ptrue, true, op0, op1);
24725
24726 /* Test the opposite condition for the ordered elements,
24727 then invert the result. */
24728 if (code == UNEQ)
24729 code = NE;
24730 else
24731 code = reverse_condition_maybe_unordered (code);
24732 if (can_invert_p)
24733 {
24734 aarch64_emit_sve_fp_cond (target, code,
24735 ordered, false, op0, op1);
24736 return true;
24737 }
24738 aarch64_emit_sve_invert_fp_cond (target, code,
24739 ordered, false, op0, op1);
24740 return false;
24741 }
24742 break;
24743
24744 case ORDERED:
24745 /* ORDERED has no immediate form. */
24746 op1 = force_reg (data_mode, op1);
24747 break;
24748
24749 default:
24750 gcc_unreachable ();
24751 }
24752
24753 /* There is native support for the inverse comparison. */
24754 code = reverse_condition_maybe_unordered (code);
24755 if (can_invert_p)
24756 {
24757 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24758 return true;
24759 }
24760 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24761 return false;
24762 }
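/* Illustrative sketch only (not part of GCC): a scalar model of the
   flag_trapping_math path above for the "unordered or ..." comparisons.
   The ordered lanes are computed first, the opposite comparison is then
   applied under that predicate only (so the inactive lanes never reach it),
   and the result is inverted.  UNGE is used as the example.  */
#if 0
#include <cassert>
#include <cmath>

static bool
unge_via_inverted_lt (double x, double y)
{
  bool ordered = !std::isnan (x) && !std::isnan (y);
  /* FCMLT governed by the "ordered" predicate: inactive lanes are false.  */
  bool lt_on_ordered = ordered && x < y;
  return !lt_on_ordered;
}

int
main ()
{
  assert (unge_via_inverted_lt (2.0, 1.0));	/* 2 >= 1.  */
  assert (!unge_via_inverted_lt (1.0, 2.0));	/* !(1 >= 2).  */
  assert (unge_via_inverted_lt (NAN, 1.0));	/* Unordered, so UNGE holds.  */
  return 0;
}
#endif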
24763
24764 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24765 of the data being selected and CMP_MODE is the mode of the values being
24766 compared. */
24767
24768 void
24769 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24770 rtx *ops)
24771 {
24772 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24773 rtx pred = gen_reg_rtx (pred_mode);
24774 if (FLOAT_MODE_P (cmp_mode))
24775 {
24776 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24777 ops[4], ops[5], true))
24778 std::swap (ops[1], ops[2]);
24779 }
24780 else
24781 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24782
24783 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24784 ops[1] = force_reg (data_mode, ops[1]);
24785 /* The "false" value can only be zero if the "true" value is a constant. */
24786 if (register_operand (ops[1], data_mode)
24787 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24788 ops[2] = force_reg (data_mode, ops[2]);
24789
24790 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24791 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24792 }
24793
24794 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24795 true. However due to issues with register allocation it is preferable
24796 to avoid tying integer scalar and FP scalar modes. Executing integer
24797 operations in general registers is better than treating them as scalar
24798 vector operations. This reduces latency and avoids redundant int<->FP
24799 moves. So tie modes if they are either the same class, or vector modes
24800 with other vector modes, vector structs or any scalar mode. */
24801
24802 static bool
24803 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24804 {
24805 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24806 != aarch64_advsimd_partial_struct_mode_p (mode2))
24807 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24808 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24809 return false;
24810
24811 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24812 return true;
24813
24814 /* We specifically want to allow elements of "structure" modes to
24815 be tieable to the structure. This more general condition allows
24816 other rarer situations too. The reason we don't extend this to
24817 predicate modes is that there are no predicate structure modes
24818 nor any specific instructions for extracting part of a predicate
24819 register. */
24820 if (aarch64_vector_data_mode_p (mode1)
24821 && aarch64_vector_data_mode_p (mode2))
24822 return true;
24823
24824 /* Also allow any scalar modes with vectors. */
24825 if (aarch64_vector_mode_supported_p (mode1)
24826 || aarch64_vector_mode_supported_p (mode2))
24827 return true;
24828
24829 return false;
24830 }
24831
24832 /* Return a new RTX holding the result of moving POINTER forward by
24833 AMOUNT bytes. */
24834
24835 static rtx
24836 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24837 {
24838 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24839
24840 return adjust_automodify_address (pointer, GET_MODE (pointer),
24841 next, amount);
24842 }
24843
24844 /* Return a new RTX holding the result of moving POINTER forward by the
24845 size of the mode it points to. */
24846
24847 static rtx
24848 aarch64_progress_pointer (rtx pointer)
24849 {
24850 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24851 }
24852
24853 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24854 MODE bytes. */
24855
24856 static void
24857 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24858 machine_mode mode)
24859 {
24860 /* Handle 256-bit memcpy separately. We do this by making two adjacent
24861 128-bit memory copies using V4SImode so that we can use Q registers. */
24862 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24863 {
24864 mode = V4SImode;
24865 rtx reg1 = gen_reg_rtx (mode);
24866 rtx reg2 = gen_reg_rtx (mode);
24867 /* "Cast" the pointers to the correct mode. */
24868 *src = adjust_address (*src, mode, 0);
24869 *dst = adjust_address (*dst, mode, 0);
24870 /* Emit the memcpy. */
24871 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24872 aarch64_progress_pointer (*src)));
24873 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24874 aarch64_progress_pointer (*dst), reg2));
24875 /* Move the pointers forward. */
24876 *src = aarch64_move_pointer (*src, 32);
24877 *dst = aarch64_move_pointer (*dst, 32);
24878 return;
24879 }
24880
24881 rtx reg = gen_reg_rtx (mode);
24882
24883 /* "Cast" the pointers to the correct mode. */
24884 *src = adjust_address (*src, mode, 0);
24885 *dst = adjust_address (*dst, mode, 0);
24886 /* Emit the memcpy. */
24887 emit_move_insn (reg, *src);
24888 emit_move_insn (*dst, reg);
24889 /* Move the pointers forward. */
24890 *src = aarch64_progress_pointer (*src);
24891 *dst = aarch64_progress_pointer (*dst);
24892 }
24893
24894 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24895 from the cpymem pattern. Return true iff we succeeded. */
24896 static bool
24897 aarch64_expand_cpymem_mops (rtx *operands)
24898 {
24899 if (!TARGET_MOPS)
24900 return false;
24901
24902 /* All three registers are changed by the instruction, so each one
24903 must be a fresh pseudo. */
24904 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24905 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24906 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24907 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24908 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24909 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24910
24911 return true;
24912 }
24913
24914 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24915 we succeed, otherwise return false, indicating that a libcall to
24916 memcpy should be emitted. */
24917
24918 bool
24919 aarch64_expand_cpymem (rtx *operands)
24920 {
24921 int mode_bits;
24922 rtx dst = operands[0];
24923 rtx src = operands[1];
24924 rtx base;
24925 machine_mode cur_mode = BLKmode;
24926
24927 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24928 if (!CONST_INT_P (operands[2]))
24929 return aarch64_expand_cpymem_mops (operands);
24930
24931 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24932
24933 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24934 unsigned HOST_WIDE_INT max_copy_size
24935 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24936
24937 bool size_p = optimize_function_for_size_p (cfun);
24938
24939 /* Large constant-sized cpymem should go through MOPS when possible.
24940 It should be a win even for size optimization in the general case.
24941 For speed optimization the choice between MOPS and the SIMD sequence
24942 depends on the size of the copy, rather than number of instructions,
24943 alignment etc. */
24944 if (size > max_copy_size)
24945 return aarch64_expand_cpymem_mops (operands);
24946
24947 int copy_bits = 256;
24948
24949 /* Default to 256-bit LDP/STP on large copies; however, for small copies,
24950 lack of SIMD support, or slow 256-bit LDP/STP, fall back to 128-bit chunks. */
24951 if (size <= 24
24952 || !TARGET_SIMD
24953 || (aarch64_tune_params.extra_tuning_flags
24954 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24955 copy_bits = 128;
24956
24957 /* Emit an inline load+store sequence and count the number of operations
24958 involved. We use a simple count of just the loads and stores emitted
24959 rather than rtx_insn count as all the pointer adjustments and reg copying
24960 in this function will get optimized away later in the pipeline. */
24961 start_sequence ();
24962 unsigned nops = 0;
24963
24964 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24965 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24966
24967 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24968 src = adjust_automodify_address (src, VOIDmode, base, 0);
24969
24970 /* Convert size to bits to make the rest of the code simpler. */
24971 int n = size * BITS_PER_UNIT;
24972
24973 while (n > 0)
24974 {
24975 /* Find the largest mode in which to do the copy without reading or
24976 writing past the end. */
24977 opt_scalar_int_mode mode_iter;
24978 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24979 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24980 cur_mode = mode_iter.require ();
24981
24982 gcc_assert (cur_mode != BLKmode);
24983
24984 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24985
24986 /* Prefer Q-register accesses for the last bytes. */
24987 if (mode_bits == 128 && copy_bits == 256)
24988 cur_mode = V4SImode;
24989
24990 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24991 /* A single block copy is 1 load + 1 store. */
24992 nops += 2;
24993 n -= mode_bits;
24994
24995 /* Emit trailing copies using overlapping unaligned accesses
24996 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
24997 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24998 {
24999 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
25000 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25001 gcc_assert (n_bits <= mode_bits);
25002 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
25003 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25004 n = n_bits;
25005 }
25006 }
25007 rtx_insn *seq = get_insns ();
25008 end_sequence ();
25009 /* The MOPS sequence requires 3 instructions for the memory copying + 1 to move
25010 the constant size into a register. */
25011 unsigned mops_cost = 3 + 1;
25012
25013 /* If MOPS is available at this point we don't consider the libcall as it's
25014 not a win even on code size. At this point only consider MOPS if
25015 optimizing for size. For speed optimizations we will have chosen between
25016 the two based on copy size already. */
25017 if (TARGET_MOPS)
25018 {
25019 if (size_p && mops_cost < nops)
25020 return aarch64_expand_cpymem_mops (operands);
25021 emit_insn (seq);
25022 return true;
25023 }
25024
25025 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
25026 arguments + 1 for the call. When MOPS is not available and we're
25027 optimizing for size a libcall may be preferable. */
25028 unsigned libcall_cost = 4;
25029 if (size_p && libcall_cost < nops)
25030 return false;
25031
25032 emit_insn (seq);
25033 return true;
25034 }
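/* Illustrative sketch only (not part of GCC): how the loop above decomposes
   a constant copy size into chunks, including the overlapping trailing
   access used when !STRICT_ALIGNMENT.  Sizes are in bytes here, whereas the
   code above works in bits, and the MOPS path, cost checks and the V4SImode
   upgrade of 128-bit chunks are all ignored.  */
#if 0
#include <cassert>
#include <utility>
#include <vector>

/* Return (offset, size) pairs, one per load/store.  */
static std::vector<std::pair<int, int>>
model_copy_chunks (int size, int copy_limit, bool strict_align)
{
  static const int modes[] = { 1, 2, 4, 8, 16, 32 };
  std::vector<std::pair<int, int>> chunks;
  int offset = 0, n = size;
  while (n > 0)
    {
      /* Largest chunk that fits both the remaining size and the limit.  */
      int cur = 1;
      for (int m : modes)
        if (m <= n && m <= copy_limit)
          cur = m;
      chunks.emplace_back (offset, cur);
      offset += cur;
      n -= cur;

      /* Cover a small tail with one access that overlaps the previous
         chunk instead of emitting several smaller accesses.  */
      if (n > 0 && n < copy_limit / 2 && !strict_align)
        {
          int tail = 0;
          for (int m : modes)
            if (m >= n)
              {
                tail = m;
                break;
              }
          offset += n - tail;
          n = tail;
        }
    }
  return chunks;
}

int
main ()
{
  /* A 15-byte copy with a 16-byte chunk limit: an 8-byte access at offset 0
     followed by an overlapping 8-byte access at offset 7.  */
  auto chunks = model_copy_chunks (15, 16, false);
  assert (chunks.size () == 2);
  assert (chunks[0] == std::make_pair (0, 8));
  assert (chunks[1] == std::make_pair (7, 8));
  return 0;
}
#endif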
25035
25036 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
25037 SRC is a register we have created with the duplicated value to be set. */
25038 static void
25039 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
25040 machine_mode mode)
25041 {
25042 /* If we are copying 128 bits or 256 bits, we can do that straight from
25043 the SIMD register we prepared. */
25044 if (known_eq (GET_MODE_BITSIZE (mode), 256))
25045 {
25046 mode = GET_MODE (src);
25047 /* "Cast" the *dst to the correct mode. */
25048 *dst = adjust_address (*dst, mode, 0);
25049 /* Emit the memset. */
25050 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
25051 aarch64_progress_pointer (*dst), src));
25052
25053 /* Move the pointers forward. */
25054 *dst = aarch64_move_pointer (*dst, 32);
25055 return;
25056 }
25057 if (known_eq (GET_MODE_BITSIZE (mode), 128))
25058 {
25059 /* "Cast" the *dst to the correct mode. */
25060 *dst = adjust_address (*dst, GET_MODE (src), 0);
25061 /* Emit the memset. */
25062 emit_move_insn (*dst, src);
25063 /* Move the pointers forward. */
25064 *dst = aarch64_move_pointer (*dst, 16);
25065 return;
25066 }
25067 /* For anything smaller, we have to extract the right amount from src. */
25068 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
25069
25070 /* "Cast" the *dst to the correct mode. */
25071 *dst = adjust_address (*dst, mode, 0);
25072 /* Emit the memset. */
25073 emit_move_insn (*dst, reg);
25074 /* Move the pointer forward. */
25075 *dst = aarch64_progress_pointer (*dst);
25076 }
25077
25078 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
25079 as for the setmem pattern. Return true iff we succeed. */
25080 static bool
25081 aarch64_expand_setmem_mops (rtx *operands)
25082 {
25083 if (!TARGET_MOPS)
25084 return false;
25085
25086 /* The first two registers are changed by the instruction, so both
25087 of them must be a fresh pseudo. */
25088 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
25089 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
25090 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
25091 rtx val = operands[2];
25092 if (val != CONST0_RTX (QImode))
25093 val = force_reg (QImode, val);
25094 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
25095 return true;
25096 }
25097
25098 /* Expand setmem, as if from a __builtin_memset. Return true if
25099 we succeed, otherwise return false. */
25100
25101 bool
25102 aarch64_expand_setmem (rtx *operands)
25103 {
25104 int n, mode_bits;
25105 unsigned HOST_WIDE_INT len;
25106 rtx dst = operands[0];
25107 rtx val = operands[2], src;
25108 rtx base;
25109 machine_mode cur_mode = BLKmode, next_mode;
25110
25111 /* If we don't have SIMD registers or the size is variable use the MOPS
25112 inlined sequence if possible. */
25113 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
25114 return aarch64_expand_setmem_mops (operands);
25115
25116 bool size_p = optimize_function_for_size_p (cfun);
25117
25118 /* Default the maximum to 256 bytes when considering only a libcall vs
25119 the SIMD broadcast sequence. */
25120 unsigned max_set_size = 256;
25121
25122 len = INTVAL (operands[1]);
25123 if (len > max_set_size && !TARGET_MOPS)
25124 return false;
25125
25126 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
25127 /* The MOPS sequence takes:
25128 3 instructions for the memory storing
25129 + 1 to move the constant size into a reg
25130 + 1 if VAL is a non-zero constant to move into a reg
25131 (zero constants can use XZR directly). */
25132 unsigned mops_cost = 3 + 1 + cst_val;
25133 /* A libcall to memset in the worst case takes 3 instructions to prepare
25134 the arguments + 1 for the call. */
25135 unsigned libcall_cost = 4;
25136
25137 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
25138 when available. */
25139 if (TARGET_MOPS
25140 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
25141 return aarch64_expand_setmem_mops (operands);
25142
25143 /* Attempt a sequence with a vector broadcast followed by stores.
25144 Count the number of operations involved to see if it's worth it
25145 against the alternatives. A simple counter simd_ops on the
25146 algorithmically-relevant operations is used rather than an rtx_insn count
25147 as all the pointer adjustments and mode reinterprets will be optimized
25148 away later. */
25149 start_sequence ();
25150 unsigned simd_ops = 0;
25151
25152 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
25153 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
25154
25155 /* Prepare the val using a DUP/MOVI v0.16B, val. */
25156 src = expand_vector_broadcast (V16QImode, val);
25157 src = force_reg (V16QImode, src);
25158 simd_ops++;
25159 /* Convert len to bits to make the rest of the code simpler. */
25160 n = len * BITS_PER_UNIT;
25161
25162 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
25163 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
25164 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25165 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
25166 ? GET_MODE_BITSIZE (TImode) : 256;
25167
25168 while (n > 0)
25169 {
25170 /* Find the largest mode in which to do the copy without
25171 writing past the end. */
25172 opt_scalar_int_mode mode_iter;
25173 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25174 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25175 cur_mode = mode_iter.require ();
25176
25177 gcc_assert (cur_mode != BLKmode);
25178
25179 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25180 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
25181 simd_ops++;
25182 n -= mode_bits;
25183
25184 /* Do certain trailing copies as overlapping if that is going to be
25185 cheaper, i.e. use fewer instructions. For instance, for a 15
25186 byte copy it is more efficient to do two overlapping 8 byte copies than
25187 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25188 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
25189 {
25190 next_mode = smallest_mode_for_size (n, MODE_INT);
25191 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25192 gcc_assert (n_bits <= mode_bits);
25193 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25194 n = n_bits;
25195 }
25196 }
25197 rtx_insn *seq = get_insns ();
25198 end_sequence ();
25199
25200 if (size_p)
25201 {
25202 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
25203 a call to memset, or the MOPS expansion. */
25204 if (TARGET_MOPS
25205 && mops_cost <= libcall_cost
25206 && mops_cost <= simd_ops)
25207 return aarch64_expand_setmem_mops (operands);
25208 /* If MOPS is not available or is not shorter, pick a libcall if the SIMD
25209 sequence is too long. */
25210 else if (libcall_cost < simd_ops)
25211 return false;
25212 emit_insn (seq);
25213 return true;
25214 }
25215
25216 /* At this point the SIMD broadcast sequence is the best choice when
25217 optimizing for speed. */
25218 emit_insn (seq);
25219 return true;
25220 }
25221
25222
25223 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
25224 SImode stores. Handle the case when the constant has identical
25225 bottom and top halves. This is beneficial when the two stores can be
25226 merged into an STP and we avoid synthesising potentially expensive
25227 immediates twice. Return true if such a split is possible. */
25228
25229 bool
25230 aarch64_split_dimode_const_store (rtx dst, rtx src)
25231 {
25232 rtx lo = gen_lowpart (SImode, src);
25233 rtx hi = gen_highpart_mode (SImode, DImode, src);
25234
25235 bool size_p = optimize_function_for_size_p (cfun);
25236
25237 if (!rtx_equal_p (lo, hi))
25238 return false;
25239
25240 unsigned int orig_cost
25241 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25242 unsigned int lo_cost
25243 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25244
25245 /* We want to transform:
25246 MOV x1, 49370
25247 MOVK x1, 0x140, lsl 16
25248 MOVK x1, 0xc0da, lsl 32
25249 MOVK x1, 0x140, lsl 48
25250 STR x1, [x0]
25251 into:
25252 MOV w1, 49370
25253 MOVK w1, 0x140, lsl 16
25254 STP w1, w1, [x0]
25255 So we want to perform this only when we save two instructions
25256 or more. When optimizing for size, however, accept any code size
25257 savings we can. */
25258 if (size_p && orig_cost <= lo_cost)
25259 return false;
25260
25261 if (!size_p
25262 && (orig_cost <= lo_cost + 1))
25263 return false;
25264
25265 rtx mem_lo = adjust_address (dst, SImode, 0);
25266 if (!aarch64_mem_pair_operand (mem_lo, SImode))
25267 return false;
25268
25269 rtx tmp_reg = gen_reg_rtx (SImode);
25270 aarch64_expand_mov_immediate (tmp_reg, lo);
25271 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25272 /* Don't emit an explicit store pair as this may not always be profitable.
25273 Let the sched-fusion logic decide whether to merge them. */
25274 emit_move_insn (mem_lo, tmp_reg);
25275 emit_move_insn (mem_hi, tmp_reg);
25276
25277 return true;
25278 }
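/* Illustrative sketch only (not part of GCC): the precondition checked by
   aarch64_split_dimode_const_store above.  The 64-bit constant must have
   identical 32-bit halves, as in the 0x0140c0da0140c0da example from the
   comment, so that a single 32-bit immediate can simply be stored twice.  */
#if 0
#include <cassert>
#include <cstdint>

static bool
halves_identical_p (uint64_t val)
{
  uint32_t lo = (uint32_t) val;
  uint32_t hi = (uint32_t) (val >> 32);
  return lo == hi;
}

int
main ()
{
  assert (halves_identical_p (0x0140c0da0140c0daULL));
  assert (!halves_identical_p (0x00000140c0da0000ULL));
  return 0;
}
#endif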
25279
25280 /* Generate RTL for a conditional branch with rtx comparison CODE in
25281 mode CC_MODE. The destination of the unlikely conditional branch
25282 is LABEL_REF. */
25283
25284 void
25285 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25286 rtx label_ref)
25287 {
25288 rtx x;
25289 x = gen_rtx_fmt_ee (code, VOIDmode,
25290 gen_rtx_REG (cc_mode, CC_REGNUM),
25291 const0_rtx);
25292
25293 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25294 gen_rtx_LABEL_REF (VOIDmode, label_ref),
25295 pc_rtx);
25296 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25297 }
25298
25299 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25300
25301 OP1 represents the TImode destination operand 1
25302 OP2 represents the TImode destination operand 2
25303 LOW_DEST represents the low half (DImode) of TImode operand 0
25304 LOW_IN1 represents the low half (DImode) of TImode operand 1
25305 LOW_IN2 represents the low half (DImode) of TImode operand 2
25306 HIGH_DEST represents the high half (DImode) of TImode operand 0
25307 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25308 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25309
25310 void
25311 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25312 rtx *low_in1, rtx *low_in2,
25313 rtx *high_dest, rtx *high_in1,
25314 rtx *high_in2)
25315 {
25316 *low_dest = gen_reg_rtx (DImode);
25317 *low_in1 = gen_lowpart (DImode, op1);
25318 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25319 subreg_lowpart_offset (DImode, TImode));
25320 *high_dest = gen_reg_rtx (DImode);
25321 *high_in1 = gen_highpart (DImode, op1);
25322 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25323 subreg_highpart_offset (DImode, TImode));
25324 }
25325
25326 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25327
25328 This function differs from 'aarch64_addti_scratch_regs' in that
25329 OP1 can be an immediate constant (zero). We must call
25330 subreg_highpart_offset with DImode and TImode arguments, otherwise
25331 VOIDmode will be used for the const_int which generates an internal
25332 error from subreg_size_highpart_offset which does not expect a size of zero.
25333
25334 OP1 represents the TImode destination operand 1
25335 OP2 represents the TImode destination operand 2
25336 LOW_DEST represents the low half (DImode) of TImode operand 0
25337 LOW_IN1 represents the low half (DImode) of TImode operand 1
25338 LOW_IN2 represents the low half (DImode) of TImode operand 2
25339 HIGH_DEST represents the high half (DImode) of TImode operand 0
25340 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25341 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25342
25343
25344 void
25345 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25346 rtx *low_in1, rtx *low_in2,
25347 rtx *high_dest, rtx *high_in1,
25348 rtx *high_in2)
25349 {
25350 *low_dest = gen_reg_rtx (DImode);
25351 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25352 subreg_lowpart_offset (DImode, TImode));
25353
25354 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25355 subreg_lowpart_offset (DImode, TImode));
25356 *high_dest = gen_reg_rtx (DImode);
25357
25358 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25359 subreg_highpart_offset (DImode, TImode));
25360 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25361 subreg_highpart_offset (DImode, TImode));
25362 }
25363
25364 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25365
25366 OP0 represents the TImode destination operand 0
25367 LOW_DEST represents the low half (DImode) of TImode operand 0
25368 LOW_IN1 represents the low half (DImode) of TImode operand 1
25369 LOW_IN2 represents the low half (DImode) of TImode operand 2
25370 HIGH_DEST represents the high half (DImode) of TImode operand 0
25371 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25372 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25373 UNSIGNED_P is true if the operation is being performed on unsigned
25374 values. */
25375 void
25376 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25377 rtx low_in2, rtx high_dest, rtx high_in1,
25378 rtx high_in2, bool unsigned_p)
25379 {
25380 if (low_in2 == const0_rtx)
25381 {
25382 low_dest = low_in1;
25383 high_in2 = force_reg (DImode, high_in2);
25384 if (unsigned_p)
25385 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25386 else
25387 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
25388 }
25389 else
25390 {
25391 if (aarch64_plus_immediate (low_in2, DImode))
25392 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25393 GEN_INT (-UINTVAL (low_in2))));
25394 else
25395 {
25396 low_in2 = force_reg (DImode, low_in2);
25397 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25398 }
25399 high_in2 = force_reg (DImode, high_in2);
25400
25401 if (unsigned_p)
25402 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25403 else
25404 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25405 }
25406
25407 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25408 emit_move_insn (gen_highpart (DImode, op0), high_dest);
25409
25410 }
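/* Illustrative sketch only (not part of GCC): a scalar model of the basic
   wide subtraction expanded above, ignoring the overflow-checking variants.
   The low halves are subtracted first, producing a borrow, which is then
   consumed by the high-half subtraction, as SUBS followed by SBCS do.  */
#if 0
#include <cassert>
#include <cstdint>

static void
sub128 (uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi,
        uint64_t *r_lo, uint64_t *r_hi)
{
  *r_lo = a_lo - b_lo;
  uint64_t borrow = a_lo < b_lo;	/* The AArch64 C flag is !borrow.  */
  *r_hi = a_hi - b_hi - borrow;
}

int
main ()
{
  uint64_t lo, hi;
  /* (2^64 - 1) - 2: no borrow into the high half.  */
  sub128 (~0ULL, 0, 2, 0, &lo, &hi);
  assert (lo == ~0ULL - 2 && hi == 0);
  /* 2^64 - 1: the low half wraps and borrows from the high half.  */
  sub128 (0, 1, 1, 0, &lo, &hi);
  assert (lo == ~0ULL && hi == 0);
  return 0;
}
#endif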
25411
25412 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
25413
25414 static unsigned HOST_WIDE_INT
25415 aarch64_asan_shadow_offset (void)
25416 {
25417 if (TARGET_ILP32)
25418 return (HOST_WIDE_INT_1 << 29);
25419 else
25420 return (HOST_WIDE_INT_1 << 36);
25421 }
25422
25423 static rtx
25424 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25425 int code, tree treeop0, tree treeop1)
25426 {
25427 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25428 rtx op0, op1;
25429 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25430 insn_code icode;
25431 struct expand_operand ops[4];
25432
25433 start_sequence ();
25434 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25435
25436 op_mode = GET_MODE (op0);
25437 if (op_mode == VOIDmode)
25438 op_mode = GET_MODE (op1);
25439
25440 switch (op_mode)
25441 {
25442 case E_QImode:
25443 case E_HImode:
25444 case E_SImode:
25445 cmp_mode = SImode;
25446 icode = CODE_FOR_cmpsi;
25447 break;
25448
25449 case E_DImode:
25450 cmp_mode = DImode;
25451 icode = CODE_FOR_cmpdi;
25452 break;
25453
25454 case E_SFmode:
25455 cmp_mode = SFmode;
25456 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25457 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25458 break;
25459
25460 case E_DFmode:
25461 cmp_mode = DFmode;
25462 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25463 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25464 break;
25465
25466 default:
25467 end_sequence ();
25468 return NULL_RTX;
25469 }
25470
25471 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25472 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25473 if (!op0 || !op1)
25474 {
25475 end_sequence ();
25476 return NULL_RTX;
25477 }
25478 *prep_seq = get_insns ();
25479 end_sequence ();
25480
25481 create_fixed_operand (&ops[0], op0);
25482 create_fixed_operand (&ops[1], op1);
25483
25484 start_sequence ();
25485 if (!maybe_expand_insn (icode, 2, ops))
25486 {
25487 end_sequence ();
25488 return NULL_RTX;
25489 }
25490 *gen_seq = get_insns ();
25491 end_sequence ();
25492
25493 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25494 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
25495 }
25496
25497 static rtx
25498 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25499 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25500 {
25501 rtx op0, op1, target;
25502 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25503 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25504 insn_code icode;
25505 struct expand_operand ops[6];
25506 int aarch64_cond;
25507
25508 push_to_sequence (*prep_seq);
25509 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25510
25511 op_mode = GET_MODE (op0);
25512 if (op_mode == VOIDmode)
25513 op_mode = GET_MODE (op1);
25514
25515 switch (op_mode)
25516 {
25517 case E_QImode:
25518 case E_HImode:
25519 case E_SImode:
25520 cmp_mode = SImode;
25521 break;
25522
25523 case E_DImode:
25524 cmp_mode = DImode;
25525 break;
25526
25527 case E_SFmode:
25528 cmp_mode = SFmode;
25529 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25530 break;
25531
25532 case E_DFmode:
25533 cmp_mode = DFmode;
25534 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25535 break;
25536
25537 default:
25538 end_sequence ();
25539 return NULL_RTX;
25540 }
25541
25542 icode = code_for_ccmp (cc_mode, cmp_mode);
25543
25544 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25545 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25546 if (!op0 || !op1)
25547 {
25548 end_sequence ();
25549 return NULL_RTX;
25550 }
25551 *prep_seq = get_insns ();
25552 end_sequence ();
25553
25554 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25555 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25556
25557 if (bit_code != AND)
25558 {
25559 /* Treat the ccmp patterns as canonical and use them where possible,
25560 but fall back to ccmp_rev patterns if there's no other option. */
25561 rtx_code prev_code = GET_CODE (prev);
25562 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25563 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25564 && !(prev_code == EQ
25565 || prev_code == NE
25566 || prev_code == ORDERED
25567 || prev_code == UNORDERED))
25568 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25569 else
25570 {
25571 rtx_code code = reverse_condition (prev_code);
25572 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25573 }
25574 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25575 }
25576
25577 create_fixed_operand (&ops[0], XEXP (prev, 0));
25578 create_fixed_operand (&ops[1], target);
25579 create_fixed_operand (&ops[2], op0);
25580 create_fixed_operand (&ops[3], op1);
25581 create_fixed_operand (&ops[4], prev);
25582 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25583
25584 push_to_sequence (*gen_seq);
25585 if (!maybe_expand_insn (icode, 6, ops))
25586 {
25587 end_sequence ();
25588 return NULL_RTX;
25589 }
25590
25591 *gen_seq = get_insns ();
25592 end_sequence ();
25593
25594 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
25595 }
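/* As an illustrative (hypothetical) example of the sequences built above:
   for a condition such as "x == 0 && y == 4", aarch64_gen_ccmp_first emits
   the initial "cmp w0, #0" and aarch64_gen_ccmp_next chains a conditional
   compare like "ccmp w1, #4, #0, eq", whose immediate NZCV value makes the
   final branch fail whenever the first comparison already failed.  The
   exact condition and NZCV bits come from aarch64_get_condition_code_1 and
   the ccmp/ccmp_rev patterns.  */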
25596
25597 #undef TARGET_GEN_CCMP_FIRST
25598 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25599
25600 #undef TARGET_GEN_CCMP_NEXT
25601 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25602
25603 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25604 instruction fusion of some sort. */
25605
25606 static bool
25607 aarch64_macro_fusion_p (void)
25608 {
25609 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25610 }
25611
25612
25613 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25614 should be kept together during scheduling. */
25615
25616 static bool
25617 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25618 {
25619 rtx set_dest;
25620 rtx prev_set = single_set (prev);
25621 rtx curr_set = single_set (curr);
25622 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
25623 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25624
25625 if (!aarch64_macro_fusion_p ())
25626 return false;
25627
25628 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25629 {
25630 /* We are trying to match:
25631 prev (mov) == (set (reg r0) (const_int imm16))
25632 curr (movk) == (set (zero_extract (reg r0)
25633 (const_int 16)
25634 (const_int 16))
25635 (const_int imm16_1)) */
25636
25637 set_dest = SET_DEST (curr_set);
25638
25639 if (GET_CODE (set_dest) == ZERO_EXTRACT
25640 && CONST_INT_P (SET_SRC (curr_set))
25641 && CONST_INT_P (SET_SRC (prev_set))
25642 && CONST_INT_P (XEXP (set_dest, 2))
25643 && INTVAL (XEXP (set_dest, 2)) == 16
25644 && REG_P (XEXP (set_dest, 0))
25645 && REG_P (SET_DEST (prev_set))
25646 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25647 {
25648 return true;
25649 }
25650 }
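/* In assembly terms the pair matched above is, for example:
     mov  x0, #0x1234
     movk x0, #0x5678, lsl #16
   i.e. a wide constant being built up piecewise in the same register.  */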
25651
25652 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25653 {
25654
25655 /* We're trying to match:
25656 prev (adrp) == (set (reg r1)
25657 (high (symbol_ref ("SYM"))))
25658 curr (add) == (set (reg r0)
25659 (lo_sum (reg r1)
25660 (symbol_ref ("SYM"))))
25661 Note that r0 need not necessarily be the same as r1, especially
25662 during pre-regalloc scheduling. */
25663
25664 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25665 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25666 {
25667 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25668 && REG_P (XEXP (SET_SRC (curr_set), 0))
25669 && REGNO (XEXP (SET_SRC (curr_set), 0))
25670 == REGNO (SET_DEST (prev_set))
25671 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25672 XEXP (SET_SRC (curr_set), 1)))
25673 return true;
25674 }
25675 }
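/* In assembly terms this corresponds to, for example:
     adrp x1, sym
     add  x0, x1, :lo12:sym
   i.e. the address of SYM being formed from its page address plus the
   low 12 bits.  */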
25676
25677 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25678 {
25679
25680 /* We're trying to match:
25681 prev (movk) == (set (zero_extract (reg r0)
25682 (const_int 16)
25683 (const_int 32))
25684 (const_int imm16_1))
25685 curr (movk) == (set (zero_extract (reg r0)
25686 (const_int 16)
25687 (const_int 48))
25688 (const_int imm16_2)) */
25689
25690 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25691 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25692 && REG_P (XEXP (SET_DEST (prev_set), 0))
25693 && REG_P (XEXP (SET_DEST (curr_set), 0))
25694 && REGNO (XEXP (SET_DEST (prev_set), 0))
25695 == REGNO (XEXP (SET_DEST (curr_set), 0))
25696 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25697 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25698 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25699 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25700 && CONST_INT_P (SET_SRC (prev_set))
25701 && CONST_INT_P (SET_SRC (curr_set)))
25702 return true;
25703
25704 }
25705 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25706 {
25707 /* We're trying to match:
25708 prev (adrp) == (set (reg r0)
25709 (high (symbol_ref ("SYM"))))
25710 curr (ldr) == (set (reg r1)
25711 (mem (lo_sum (reg r0)
25712 (symbol_ref ("SYM")))))
25713 or
25714 curr (ldr) == (set (reg r1)
25715 (zero_extend (mem
25716 (lo_sum (reg r0)
25717 (symbol_ref ("SYM")))))) */
25718 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25719 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25720 {
25721 rtx curr_src = SET_SRC (curr_set);
25722
25723 if (GET_CODE (curr_src) == ZERO_EXTEND)
25724 curr_src = XEXP (curr_src, 0);
25725
25726 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25727 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25728 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25729 == REGNO (SET_DEST (prev_set))
25730 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25731 XEXP (SET_SRC (prev_set), 0)))
25732 return true;
25733 }
25734 }
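/* In assembly terms this corresponds to, for example:
     adrp x0, sym
     ldr  w1, [x0, :lo12:sym]
   i.e. a PC-relative page address followed by a load from that page.  */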
25735
25736 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25737 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25738 && prev_set && curr_set && any_condjump_p (curr)
25739 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25740 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25741 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25742 return true;
25743
25744 /* Fuse flag-setting ALU instructions and conditional branch. */
25745 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25746 && any_condjump_p (curr))
25747 {
25748 unsigned int condreg1, condreg2;
25749 rtx cc_reg_1;
25750 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25751 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25752
25753 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25754 && prev
25755 && modified_in_p (cc_reg_1, prev))
25756 {
25757 enum attr_type prev_type = get_attr_type (prev);
25758
25759 /* FIXME: this misses some instructions which are considered simple
25760 arithmetic for ThunderX. Simple shifts are missed here. */
25761 if (prev_type == TYPE_ALUS_SREG
25762 || prev_type == TYPE_ALUS_IMM
25763 || prev_type == TYPE_LOGICS_REG
25764 || prev_type == TYPE_LOGICS_IMM)
25765 return true;
25766 }
25767 }
25768
25769 /* Fuse ALU instructions and CBZ/CBNZ. */
25770 if (prev_set
25771 && curr_set
25772 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25773 && any_condjump_p (curr))
25774 {
25775 /* We're trying to match:
25776 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25777 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25778 (const_int 0))
25779 (label_ref ("SYM"))
25780 (pc)) */
25781 if (SET_DEST (curr_set) == (pc_rtx)
25782 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25783 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25784 && REG_P (SET_DEST (prev_set))
25785 && REGNO (SET_DEST (prev_set))
25786 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25787 {
25788 /* Fuse ALU operations followed by conditional branch instruction. */
25789 switch (get_attr_type (prev))
25790 {
25791 case TYPE_ALU_IMM:
25792 case TYPE_ALU_SREG:
25793 case TYPE_ADC_REG:
25794 case TYPE_ADC_IMM:
25795 case TYPE_ADCS_REG:
25796 case TYPE_ADCS_IMM:
25797 case TYPE_LOGIC_REG:
25798 case TYPE_LOGIC_IMM:
25799 case TYPE_CSEL:
25800 case TYPE_ADR:
25801 case TYPE_MOV_IMM:
25802 case TYPE_SHIFT_REG:
25803 case TYPE_SHIFT_IMM:
25804 case TYPE_BFM:
25805 case TYPE_RBIT:
25806 case TYPE_REV:
25807 case TYPE_EXTEND:
25808 return true;
25809
25810 default:;
25811 }
25812 }
25813 }
25814
25815 /* Fuse A+B+1 and A-B-1. */
25816 if (simple_sets_p
25817 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25818 {
25819 /* We're trying to match:
25820 prev == (set (r0) (plus (r0) (r1)))
25821 curr == (set (r0) (plus (r0) (const_int 1)))
25822 or:
25823 prev == (set (r0) (minus (r0) (r1)))
25824 curr == (set (r0) (plus (r0) (const_int -1))) */
25825
25826 rtx prev_src = SET_SRC (prev_set);
25827 rtx curr_src = SET_SRC (curr_set);
25828
25829 int polarity = 1;
25830 if (GET_CODE (prev_src) == MINUS)
25831 polarity = -1;
25832
25833 if (GET_CODE (curr_src) == PLUS
25834 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25835 && CONST_INT_P (XEXP (curr_src, 1))
25836 && INTVAL (XEXP (curr_src, 1)) == polarity
25837 && REG_P (XEXP (curr_src, 0))
25838 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25839 return true;
25840 }
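/* In assembly terms this corresponds to pairs such as:
     add x0, x0, x1          sub x0, x0, x1
     add x0, x0, #1    or    sub x0, x0, #1
   which the tunings that enable AARCH64_FUSE_ADDSUB_2REG_CONST1 can treat
   as a single fused operation.  */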
25841
25842 return false;
25843 }
25844
25845 /* Return true iff the instruction fusion described by OP is enabled. */
25846
25847 bool
25848 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25849 {
25850 return (aarch64_tune_params.fusible_ops & op) != 0;
25851 }
25852
25853 /* If MEM is in the form of [base+offset], extract the two parts of the
25854 address and store them in BASE and OFFSET; otherwise return false
25855 after clearing BASE and OFFSET. */
25856
25857 bool
25858 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25859 {
25860 rtx addr;
25861
25862 gcc_assert (MEM_P (mem));
25863
25864 addr = XEXP (mem, 0);
25865
25866 if (REG_P (addr))
25867 {
25868 *base = addr;
25869 *offset = const0_rtx;
25870 return true;
25871 }
25872
25873 if (GET_CODE (addr) == PLUS
25874 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25875 {
25876 *base = XEXP (addr, 0);
25877 *offset = XEXP (addr, 1);
25878 return true;
25879 }
25880
25881 *base = NULL_RTX;
25882 *offset = NULL_RTX;
25883
25884 return false;
25885 }
25886
25887 /* Types for scheduling fusion. */
25888 enum sched_fusion_type
25889 {
25890 SCHED_FUSION_NONE = 0,
25891 SCHED_FUSION_LD_SIGN_EXTEND,
25892 SCHED_FUSION_LD_ZERO_EXTEND,
25893 SCHED_FUSION_LD,
25894 SCHED_FUSION_ST,
25895 SCHED_FUSION_NUM
25896 };
25897
25898 /* If INSN is a load or store whose address is in the form [base+offset],
25899 extract the two parts into BASE and OFFSET. Return the scheduling
25900 fusion type of this INSN. */
25901
25902 static enum sched_fusion_type
25903 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25904 {
25905 rtx x, dest, src;
25906 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25907
25908 gcc_assert (INSN_P (insn));
25909 x = PATTERN (insn);
25910 if (GET_CODE (x) != SET)
25911 return SCHED_FUSION_NONE;
25912
25913 src = SET_SRC (x);
25914 dest = SET_DEST (x);
25915
25916 machine_mode dest_mode = GET_MODE (dest);
25917
25918 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25919 return SCHED_FUSION_NONE;
25920
25921 if (GET_CODE (src) == SIGN_EXTEND)
25922 {
25923 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25924 src = XEXP (src, 0);
25925 if (!MEM_P (src) || GET_MODE (src) != SImode)
25926 return SCHED_FUSION_NONE;
25927 }
25928 else if (GET_CODE (src) == ZERO_EXTEND)
25929 {
25930 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25931 src = XEXP (src, 0);
25932 if (!MEM_P (src) || GET_MODE (src) != SImode)
25933 return SCHED_FUSION_NONE;
25934 }
25935
25936 if (MEM_P (src) && REG_P (dest))
25937 extract_base_offset_in_addr (src, base, offset);
25938 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25939 {
25940 fusion = SCHED_FUSION_ST;
25941 extract_base_offset_in_addr (dest, base, offset);
25942 }
25943 else
25944 return SCHED_FUSION_NONE;
25945
25946 if (*base == NULL_RTX || *offset == NULL_RTX)
25947 fusion = SCHED_FUSION_NONE;
25948
25949 return fusion;
25950 }
25951
25952 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25953
25954 Currently we only support fusing ldr or str instructions, so FUSION_PRI
25955 and PRI are only calculated for these instructions. For other instructions,
25956 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25957 types of instruction fusion can be added by returning different priorities.
25958
25959 It's important that irrelevant instructions get the largest FUSION_PRI. */
25960
25961 static void
25962 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25963 int *fusion_pri, int *pri)
25964 {
25965 int tmp, off_val;
25966 rtx base, offset;
25967 enum sched_fusion_type fusion;
25968
25969 gcc_assert (INSN_P (insn));
25970
25971 tmp = max_pri - 1;
25972 fusion = fusion_load_store (insn, &base, &offset);
25973 if (fusion == SCHED_FUSION_NONE)
25974 {
25975 *pri = tmp;
25976 *fusion_pri = tmp;
25977 return;
25978 }
25979
25980 /* Set FUSION_PRI according to fusion type and base register. */
25981 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25982
25983 /* Calculate PRI. */
25984 tmp /= 2;
25985
25986 /* INSN with smaller offset goes first. */
25987 off_val = (int)(INTVAL (offset));
25988 if (off_val >= 0)
25989 tmp -= (off_val & 0xfffff);
25990 else
25991 tmp += ((- off_val) & 0xfffff);
25992
25993 *pri = tmp;
25994 return;
25995 }
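/* For example, two SImode loads from [x1, 8] and [x1, 12] receive the same
   FUSION_PRI (same fusion type and base register), but the load with the
   smaller offset receives the larger PRI, so it tends to be scheduled first
   and the ldp peepholes then have a chance to merge the pair.  */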
25996
25997 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25998 Adjust priority of sha1h instructions so they are scheduled before
25999 other SHA1 instructions. */
26000
26001 static int
26002 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
26003 {
26004 rtx x = PATTERN (insn);
26005
26006 if (GET_CODE (x) == SET)
26007 {
26008 x = SET_SRC (x);
26009
26010 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
26011 return priority + 10;
26012 }
26013
26014 return priority;
26015 }
26016
26017 /* If REVERSED is null, return true if memory reference *MEM2 comes
26018 immediately after memory reference *MEM1. Do not change the references
26019 in this case.
26020
26021 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
26022 if they are, try to make them use constant offsets from the same base
26023 register. Return true on success. When returning true, set *REVERSED
26024 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
26025 static bool
26026 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
26027 {
26028 if (reversed)
26029 *reversed = false;
26030
26031 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
26032 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
26033 return false;
26034
26035 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
26036 return false;
26037
26038 auto size1 = MEM_SIZE (*mem1);
26039 auto size2 = MEM_SIZE (*mem2);
26040
26041 rtx base1, base2, offset1, offset2;
26042 extract_base_offset_in_addr (*mem1, &base1, &offset1);
26043 extract_base_offset_in_addr (*mem2, &base2, &offset2);
26044
26045 /* Make sure at least one memory is in base+offset form. */
26046 if (!(base1 && offset1) && !(base2 && offset2))
26047 return false;
26048
26049 /* If both mems already use the same base register, just check the
26050 offsets. */
26051 if (base1 && base2 && rtx_equal_p (base1, base2))
26052 {
26053 if (!offset1 || !offset2)
26054 return false;
26055
26056 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
26057 return true;
26058
26059 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
26060 {
26061 *reversed = true;
26062 return true;
26063 }
26064
26065 return false;
26066 }
26067
26068 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
26069 guarantee that the values are consecutive. */
26070 if (MEM_EXPR (*mem1)
26071 && MEM_EXPR (*mem2)
26072 && MEM_OFFSET_KNOWN_P (*mem1)
26073 && MEM_OFFSET_KNOWN_P (*mem2))
26074 {
26075 poly_int64 expr_offset1;
26076 poly_int64 expr_offset2;
26077 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
26078 &expr_offset1);
26079 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
26080 &expr_offset2);
26081 if (!expr_base1
26082 || !expr_base2
26083 || !DECL_P (expr_base1)
26084 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
26085 return false;
26086
26087 expr_offset1 += MEM_OFFSET (*mem1);
26088 expr_offset2 += MEM_OFFSET (*mem2);
26089
26090 if (known_eq (expr_offset1 + size1, expr_offset2))
26091 ;
26092 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
26093 *reversed = true;
26094 else
26095 return false;
26096
26097 if (reversed)
26098 {
26099 if (base2)
26100 {
26101 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
26102 expr_offset1 - expr_offset2);
26103 *mem1 = replace_equiv_address_nv (*mem1, addr1);
26104 }
26105 else
26106 {
26107 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
26108 expr_offset2 - expr_offset1);
26109 *mem2 = replace_equiv_address_nv (*mem2, addr2);
26110 }
26111 }
26112 return true;
26113 }
26114
26115 return false;
26116 }
26117
26118 /* Return true if MEM1 and MEM2 can be combined into a single access
26119 of mode MODE, with the combined access having the same address as MEM1. */
26120
26121 bool
26122 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
26123 {
26124 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
26125 return false;
26126 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
26127 }
26128
26129 /* Given OPERANDS of consecutive load/store, check if we can merge
26130 them into ldp/stp. LOAD is true if they are load instructions.
26131 MODE is the mode of memory operands. */
26132
26133 bool
26134 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
26135 machine_mode mode)
26136 {
26137 enum reg_class rclass_1, rclass_2;
26138 rtx mem_1, mem_2, reg_1, reg_2;
26139
26140 if (load)
26141 {
26142 mem_1 = operands[1];
26143 mem_2 = operands[3];
26144 reg_1 = operands[0];
26145 reg_2 = operands[2];
26146 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
26147 if (REGNO (reg_1) == REGNO (reg_2))
26148 return false;
26149 if (reg_overlap_mentioned_p (reg_1, mem_2))
26150 return false;
26151 }
26152 else
26153 {
26154 mem_1 = operands[0];
26155 mem_2 = operands[2];
26156 reg_1 = operands[1];
26157 reg_2 = operands[3];
26158 }
26159
26160 /* The mems cannot be volatile. */
26161 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26162 return false;
26163
26164 /* If we have SImode and slow unaligned ldp,
26165 check that the alignment is at least 8 bytes. */
26166 if (mode == SImode
26167 && (aarch64_tune_params.extra_tuning_flags
26168 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26169 && !optimize_size
26170 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26171 return false;
26172
26173 /* Check if the addresses are in the form of [base+offset]. */
26174 bool reversed = false;
26175 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
26176 return false;
26177
26178 /* The operands must be of the same size. */
26179 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
26180 GET_MODE_SIZE (GET_MODE (mem_2))));
26181
26182 /* One of the memory accesses must be a mempair operand.
26183 If it is not the first one, they need to be swapped by the
26184 peephole. */
26185 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
26186 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
26187 return false;
26188
26189 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26190 rclass_1 = FP_REGS;
26191 else
26192 rclass_1 = GENERAL_REGS;
26193
26194 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26195 rclass_2 = FP_REGS;
26196 else
26197 rclass_2 = GENERAL_REGS;
26198
26199 /* Check if the registers are of same class. */
26200 if (rclass_1 != rclass_2)
26201 return false;
26202
26203 return true;
26204 }
26205
26206 /* Given OPERANDS of consecutive load/store that can be merged,
26207 swap them if they are not in ascending order. */
26208 void
26209 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26210 {
26211 int mem_op = load ? 1 : 0;
26212 bool reversed = false;
26213 if (!aarch64_check_consecutive_mems (operands + mem_op,
26214 operands + mem_op + 2, &reversed))
26215 gcc_unreachable ();
26216
26217 if (reversed)
26218 {
26219 /* Irrespective of whether this is a load or a store,
26220 we do the same swap. */
26221 std::swap (operands[0], operands[2]);
26222 std::swap (operands[1], operands[3]);
26223 }
26224 }
26225
26226 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
26227 comparison between the two. */
26228 int
26229 aarch64_host_wide_int_compare (const void *x, const void *y)
26230 {
26231 return wi::cmps (* ((const HOST_WIDE_INT *) x),
26232 * ((const HOST_WIDE_INT *) y));
26233 }
26234
26235 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26236 other pointing to a REG rtx containing an offset, compare the offsets
26237 of the two pairs.
26238
26239 Return:
26240
26241 1 iff offset (X) > offset (Y)
26242 0 iff offset (X) == offset (Y)
26243 -1 iff offset (X) < offset (Y) */
26244 int
26245 aarch64_ldrstr_offset_compare (const void *x, const void *y)
26246 {
26247 const rtx * operands_1 = (const rtx *) x;
26248 const rtx * operands_2 = (const rtx *) y;
26249 rtx mem_1, mem_2, base, offset_1, offset_2;
26250
26251 if (MEM_P (operands_1[0]))
26252 mem_1 = operands_1[0];
26253 else
26254 mem_1 = operands_1[1];
26255
26256 if (MEM_P (operands_2[0]))
26257 mem_2 = operands_2[0];
26258 else
26259 mem_2 = operands_2[1];
26260
26261 /* Extract the offsets. */
26262 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26263 extract_base_offset_in_addr (mem_2, &base, &offset_2);
26264
26265 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26266
26267 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26268 }
26269
26270 /* Given OPERANDS of consecutive load/store, check if we can merge
26271 them into ldp/stp by adjusting the offset. LOAD is true if they
26272 are load instructions. MODE is the mode of memory operands.
26273
26274 Given the following consecutive stores:
26275
26276 str w1, [xb, 0x100]
26277 str w1, [xb, 0x104]
26278 str w1, [xb, 0x108]
26279 str w1, [xb, 0x10c]
26280
26281 Though the offsets are out of the range supported by stp, we can
26282 still pair them after adjusting the offset, like:
26283
26284 add scratch, xb, 0x100
26285 stp w1, w1, [scratch]
26286 stp w1, w1, [scratch, 0x8]
26287
26288 The peephole patterns detecting this opportunity should guarantee
26289 the scratch register is available. */
26290
26291 bool
26292 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
26293 machine_mode mode)
26294 {
26295 const int num_insns = 4;
26296 enum reg_class rclass;
26297 HOST_WIDE_INT offvals[num_insns], msize;
26298 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
26299
26300 if (load)
26301 {
26302 for (int i = 0; i < num_insns; i++)
26303 {
26304 reg[i] = operands[2 * i];
26305 mem[i] = operands[2 * i + 1];
26306
26307 gcc_assert (REG_P (reg[i]));
26308 }
26309
26310 /* Do not attempt to merge the loads if the loads clobber each other. */
26311 for (int i = 0; i < 8; i += 2)
26312 for (int j = i + 2; j < 8; j += 2)
26313 if (reg_overlap_mentioned_p (operands[i], operands[j]))
26314 return false;
26315 }
26316 else
26317 for (int i = 0; i < num_insns; i++)
26318 {
26319 mem[i] = operands[2 * i];
26320 reg[i] = operands[2 * i + 1];
26321 }
26322
26323 /* Skip if memory operand is by itself valid for ldp/stp. */
26324 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
26325 return false;
26326
26327 for (int i = 0; i < num_insns; i++)
26328 {
26329 /* The mems cannot be volatile. */
26330 if (MEM_VOLATILE_P (mem[i]))
26331 return false;
26332
26333 /* Check if the addresses are in the form of [base+offset]. */
26334 extract_base_offset_in_addr (mem[i], base + i, offset + i);
26335 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26336 return false;
26337 }
26338
26339 /* Check if the registers are of same class. */
26340 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26341 ? FP_REGS : GENERAL_REGS;
26342
26343 for (int i = 1; i < num_insns; i++)
26344 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26345 {
26346 if (rclass != FP_REGS)
26347 return false;
26348 }
26349 else
26350 {
26351 if (rclass != GENERAL_REGS)
26352 return false;
26353 }
26354
26355 /* Only the last register in the order in which they occur
26356 may be clobbered by the load. */
26357 if (rclass == GENERAL_REGS && load)
26358 for (int i = 0; i < num_insns - 1; i++)
26359 if (reg_mentioned_p (reg[i], mem[i]))
26360 return false;
26361
26362 /* Check if the bases are same. */
26363 for (int i = 0; i < num_insns - 1; i++)
26364 if (!rtx_equal_p (base[i], base[i + 1]))
26365 return false;
26366
26367 for (int i = 0; i < num_insns; i++)
26368 offvals[i] = INTVAL (offset[i]);
26369
26370 msize = GET_MODE_SIZE (mode).to_constant ();
26371
26372 /* Check if the offsets can be put in the right order to do a ldp/stp. */
26373 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26374 aarch64_host_wide_int_compare);
26375
26376 if (!(offvals[1] == offvals[0] + msize
26377 && offvals[3] == offvals[2] + msize))
26378 return false;
26379
26380 /* Check that offsets are within range of each other. The ldp/stp
26381 instructions have 7-bit immediate offsets, so use 0x80. */
26382 if (offvals[2] - offvals[0] >= msize * 0x80)
26383 return false;
26384
26385 /* The offsets must be aligned with respect to each other. */
26386 if (offvals[0] % msize != offvals[2] % msize)
26387 return false;
26388
26389 /* If we have SImode and slow unaligned ldp,
26390 check that the alignment is at least 8 bytes. */
26391 if (mode == SImode
26392 && (aarch64_tune_params.extra_tuning_flags
26393 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26394 && !optimize_size
26395 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
26396 return false;
26397
26398 return true;
26399 }
26400
26401 /* Given OPERANDS of consecutive load/store, this function pairs them
26402 into LDP/STP after adjusting the offset. It depends on the fact
26403 that the operands can be sorted so the offsets are correct for STP.
26404 MODE is the mode of memory operands. CODE is the rtl operator
26405 which should be applied to all memory operands; it is SIGN_EXTEND,
26406 ZERO_EXTEND or UNKNOWN. */
26407
26408 bool
26409 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
26410 machine_mode mode, RTX_CODE code)
26411 {
26412 rtx base, offset_1, offset_3, t1, t2;
26413 rtx mem_1, mem_2, mem_3, mem_4;
26414 rtx temp_operands[8];
26415 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26416 stp_off_upper_limit, stp_off_lower_limit, msize;
26417
26418 /* We make changes on a copy as we may still bail out. */
26419 for (int i = 0; i < 8; i ++)
26420 temp_operands[i] = operands[i];
26421
26422 /* Sort the operands. Note that for cases such as:
26423 [base + 0x310] = A
26424 [base + 0x320] = B
26425 [base + 0x330] = C
26426 [base + 0x320] = D
26427 we need a stable sort, otherwise wrong data may be stored to offset 0x320.
26428 Also note that the dead store in the above case should be optimized away,
26429 but there is no guarantee of that here. */
26430 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
26431 aarch64_ldrstr_offset_compare);
26432
26433 /* Copy the memory operands so that if we have to bail for some
26434 reason the original addresses are unchanged. */
26435 if (load)
26436 {
26437 mem_1 = copy_rtx (temp_operands[1]);
26438 mem_2 = copy_rtx (temp_operands[3]);
26439 mem_3 = copy_rtx (temp_operands[5]);
26440 mem_4 = copy_rtx (temp_operands[7]);
26441 }
26442 else
26443 {
26444 mem_1 = copy_rtx (temp_operands[0]);
26445 mem_2 = copy_rtx (temp_operands[2]);
26446 mem_3 = copy_rtx (temp_operands[4]);
26447 mem_4 = copy_rtx (temp_operands[6]);
26448 gcc_assert (code == UNKNOWN);
26449 }
26450
26451 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26452 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26453 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26454 && offset_3 != NULL_RTX);
26455
26456 /* Adjust offset so it can fit in LDP/STP instruction. */
26457 msize = GET_MODE_SIZE (mode).to_constant ();
26458 stp_off_upper_limit = msize * (0x40 - 1);
26459 stp_off_lower_limit = - msize * 0x40;
26460
26461 off_val_1 = INTVAL (offset_1);
26462 off_val_3 = INTVAL (offset_3);
26463
26464 /* The base offset is optimally half way between the two STP/LDP offsets. */
26465 if (msize <= 4)
26466 base_off = (off_val_1 + off_val_3) / 2;
26467 else
26468 /* However, due to issues with negative LDP/STP offset generation for
26469 larger modes (DF, DD, DI and vector modes), we must not use offsets more
26470 negative than 9 signed unadjusted bits can store. This
26471 provides the most range in this case. */
26472 base_off = off_val_1;
26473
26474 /* Adjust the base so that it is aligned with the addresses but still
26475 optimal. */
26476 if (base_off % msize != off_val_1 % msize)
26477 /* Fix the offset, bearing in mind we want to make it bigger not
26478 smaller. */
26479 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26480 else if (msize <= 4)
26481 /* The negative range of LDP/STP is one larger than the positive range. */
26482 base_off += msize;
26483
26484 /* Check if base offset is too big or too small. We can attempt to resolve
26485 this issue by setting it to the maximum value and seeing if the offsets
26486 still fit. */
26487 if (base_off >= 0x1000)
26488 {
26489 base_off = 0x1000 - 1;
26490 /* We must still make sure that the base offset is aligned with respect
26491 to the address. But it may not be made any bigger. */
26492 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26493 }
26494
26495 /* Likewise for the case where the base is too small. */
26496 if (base_off <= -0x1000)
26497 {
26498 base_off = -0x1000 + 1;
26499 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26500 }
26501
26502 /* Offset of the first STP/LDP. */
26503 new_off_1 = off_val_1 - base_off;
26504
26505 /* Offset of the second STP/LDP. */
26506 new_off_3 = off_val_3 - base_off;
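/* Worked example for SImode (msize == 4) accesses at offsets 0x100..0x10c:
   base_off starts as (0x100 + 0x108) / 2 == 0x104, is already suitably
   aligned and so is bumped by msize to 0x108, giving new_off_1 == -8 and
   new_off_3 == 0, both comfortably within [-0x100, 0xfc].  */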
26507
26508 /* The offsets must be within the range of the LDP/STP instructions. */
26509 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26510 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26511 return false;
26512
26513 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26514 new_off_1), true);
26515 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26516 new_off_1 + msize), true);
26517 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26518 new_off_3), true);
26519 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26520 new_off_3 + msize), true);
26521
26522 if (!aarch64_mem_pair_operand (mem_1, mode)
26523 || !aarch64_mem_pair_operand (mem_3, mode))
26524 return false;
26525
26526 if (code == ZERO_EXTEND)
26527 {
26528 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26529 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26530 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26531 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26532 }
26533 else if (code == SIGN_EXTEND)
26534 {
26535 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26536 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26537 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26538 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26539 }
26540
26541 if (load)
26542 {
26543 operands[0] = temp_operands[0];
26544 operands[1] = mem_1;
26545 operands[2] = temp_operands[2];
26546 operands[3] = mem_2;
26547 operands[4] = temp_operands[4];
26548 operands[5] = mem_3;
26549 operands[6] = temp_operands[6];
26550 operands[7] = mem_4;
26551 }
26552 else
26553 {
26554 operands[0] = mem_1;
26555 operands[1] = temp_operands[1];
26556 operands[2] = mem_2;
26557 operands[3] = temp_operands[3];
26558 operands[4] = mem_3;
26559 operands[5] = temp_operands[5];
26560 operands[6] = mem_4;
26561 operands[7] = temp_operands[7];
26562 }
26563
26564 /* Emit adjusting instruction. */
26565 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26566 /* Emit ldp/stp instructions. */
26567 t1 = gen_rtx_SET (operands[0], operands[1]);
26568 t2 = gen_rtx_SET (operands[2], operands[3]);
26569 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26570 t1 = gen_rtx_SET (operands[4], operands[5]);
26571 t2 = gen_rtx_SET (operands[6], operands[7]);
26572 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26573 return true;
26574 }
26575
26576 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26577 it isn't worth branching around empty masked ops (including masked
26578 stores). */
26579
26580 static bool
26581 aarch64_empty_mask_is_expensive (unsigned)
26582 {
26583 return false;
26584 }
26585
26586 /* Return true if a pseudo register should be created and used to hold
26587 the GOT address for PIC code. */
26588
26589 bool
26590 aarch64_use_pseudo_pic_reg (void)
26591 {
26592 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26593 }
26594
26595 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26596
26597 static int
26598 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26599 {
26600 switch (XINT (x, 1))
26601 {
26602 case UNSPEC_GOTSMALLPIC:
26603 case UNSPEC_GOTSMALLPIC28K:
26604 case UNSPEC_GOTTINYPIC:
26605 return 0;
26606 default:
26607 break;
26608 }
26609
26610 return default_unspec_may_trap_p (x, flags);
26611 }
26612
26613
26614 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
26615 return the log2 of that value. Otherwise return -1. */
26616
26617 int
26618 aarch64_fpconst_pow_of_2 (rtx x)
26619 {
26620 const REAL_VALUE_TYPE *r;
26621
26622 if (!CONST_DOUBLE_P (x))
26623 return -1;
26624
26625 r = CONST_DOUBLE_REAL_VALUE (x);
26626
26627 if (REAL_VALUE_NEGATIVE (*r)
26628 || REAL_VALUE_ISNAN (*r)
26629 || REAL_VALUE_ISINF (*r)
26630 || !real_isinteger (r, DFmode))
26631 return -1;
26632
26633 return exact_log2 (real_to_integer (r));
26634 }
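/* For example, 8.0 yields 3 and 1.0 yields 0, while 0.75, -4.0, NaN and
   infinities all yield -1.  */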
26635
26636 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26637 power of 2 (i.e. 1/2^n), return the exponent n; e.g. for X == 1/2^3
26638 return 3. Otherwise return -1. */
26639
26640 int
26641 aarch64_fpconst_pow2_recip (rtx x)
26642 {
26643 REAL_VALUE_TYPE r0;
26644
26645 if (!CONST_DOUBLE_P (x))
26646 return -1;
26647
26648 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26649 if (exact_real_inverse (DFmode, &r0)
26650 && !REAL_VALUE_NEGATIVE (r0))
26651 {
26652 int ret = exact_log2 (real_to_integer (&r0));
26653 if (ret >= 1 && ret <= 32)
26654 return ret;
26655 }
26656 return -1;
26657 }
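/* For example, 0.125 (== 1/2^3) yields 3, whereas 8.0 and 1.0 yield -1
   since they are not of the form 1/2^n with 1 <= n <= 32.  */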
26658
26659 /* If X is a vector of equal CONST_DOUBLE values and that value is
26660 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26661
26662 int
26663 aarch64_vec_fpconst_pow_of_2 (rtx x)
26664 {
26665 int nelts;
26666 if (!CONST_VECTOR_P (x)
26667 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26668 return -1;
26669
26670 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26671 return -1;
26672
26673 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26674 if (firstval <= 0)
26675 return -1;
26676
26677 for (int i = 1; i < nelts; i++)
26678 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26679 return -1;
26680
26681 return firstval;
26682 }
26683
26684 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26685 to float.
26686
26687 __fp16 always promotes through this hook.
26688 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26689 through the generic excess precision logic rather than here. */
26690
26691 static tree
26692 aarch64_promoted_type (const_tree t)
26693 {
26694 if (SCALAR_FLOAT_TYPE_P (t)
26695 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26696 return float_type_node;
26697
26698 return NULL_TREE;
26699 }
26700
26701 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26702
26703 static bool
26704 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26705 optimization_type opt_type)
26706 {
26707 switch (op)
26708 {
26709 case rsqrt_optab:
26710 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26711
26712 default:
26713 return true;
26714 }
26715 }
26716
26717 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26718
26719 static unsigned int
26720 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26721 int *offset)
26722 {
26723 /* Polynomial invariant 1 == (VG / 2) - 1. */
26724 gcc_assert (i == 1);
26725 *factor = 2;
26726 *offset = 1;
26727 return AARCH64_DWARF_VG;
26728 }
26729
26730 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26731 if MODE is HFmode, and punt to the generic implementation otherwise. */
26732
26733 static bool
26734 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26735 {
26736 return (mode == HFmode
26737 ? true
26738 : default_libgcc_floating_mode_supported_p (mode));
26739 }
26740
26741 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26742 if MODE is HFmode, and punt to the generic implementation otherwise. */
26743
26744 static bool
26745 aarch64_scalar_mode_supported_p (scalar_mode mode)
26746 {
26747 if (DECIMAL_FLOAT_MODE_P (mode))
26748 return default_decimal_float_supported_p ();
26749
26750 return (mode == HFmode
26751 ? true
26752 : default_scalar_mode_supported_p (mode));
26753 }
26754
26755 /* Set the value of FLT_EVAL_METHOD.
26756 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26757
26758 0: evaluate all operations and constants, whose semantic type has at
26759 most the range and precision of type float, to the range and
26760 precision of float; evaluate all other operations and constants to
26761 the range and precision of the semantic type;
26762
26763 N, where _FloatN is a supported interchange floating type
26764 evaluate all operations and constants, whose semantic type has at
26765 most the range and precision of _FloatN type, to the range and
26766 precision of the _FloatN type; evaluate all other operations and
26767 constants to the range and precision of the semantic type;
26768
26769 If we have the ARMv8.2-A extensions then we support _Float16 in native
26770 precision, so we should set this to 16. Otherwise, we support the type,
26771 but want to evaluate expressions in float precision, so set this to
26772 0. */
26773
26774 static enum flt_eval_method
26775 aarch64_excess_precision (enum excess_precision_type type)
26776 {
26777 switch (type)
26778 {
26779 case EXCESS_PRECISION_TYPE_FAST:
26780 case EXCESS_PRECISION_TYPE_STANDARD:
26781 /* We can calculate either in 16-bit range and precision or
26782 32-bit range and precision. Make that decision based on whether
26783 we have native support for the ARMv8.2-A 16-bit floating-point
26784 instructions or not. */
26785 return (TARGET_FP_F16INST
26786 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26787 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26788 case EXCESS_PRECISION_TYPE_IMPLICIT:
26789 case EXCESS_PRECISION_TYPE_FLOAT16:
26790 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26791 default:
26792 gcc_unreachable ();
26793 }
26794 return FLT_EVAL_METHOD_UNPREDICTABLE;
26795 }
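/* As an illustration of the effect: given "_Float16 a, b, c; a = b + c;",
   a value of 16 here lets the addition be performed directly with the
   Armv8.2-A FP16 instructions, whereas a value of 0 means b and c are
   promoted to float, added, and the result converted back to _Float16 on
   assignment.  */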
26796
26797 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26798 scheduled for speculative execution. Reject the long-running division
26799 and square-root instructions. */
26800
26801 static bool
26802 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26803 {
26804 switch (get_attr_type (insn))
26805 {
26806 case TYPE_SDIV:
26807 case TYPE_UDIV:
26808 case TYPE_FDIVS:
26809 case TYPE_FDIVD:
26810 case TYPE_FSQRTS:
26811 case TYPE_FSQRTD:
26812 case TYPE_NEON_FP_SQRT_S:
26813 case TYPE_NEON_FP_SQRT_D:
26814 case TYPE_NEON_FP_SQRT_S_Q:
26815 case TYPE_NEON_FP_SQRT_D_Q:
26816 case TYPE_NEON_FP_DIV_S:
26817 case TYPE_NEON_FP_DIV_D:
26818 case TYPE_NEON_FP_DIV_S_Q:
26819 case TYPE_NEON_FP_DIV_D_Q:
26820 return false;
26821 default:
26822 return true;
26823 }
26824 }
26825
26826 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26827
26828 static int
26829 aarch64_compute_pressure_classes (reg_class *classes)
26830 {
26831 int i = 0;
26832 classes[i++] = GENERAL_REGS;
26833 classes[i++] = FP_REGS;
26834 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26835 registers need to go in PR_LO_REGS at some point during their
26836 lifetime. Splitting it into two halves has the effect of making
26837 all predicates count against PR_LO_REGS, so that we try whenever
26838 possible to restrict the number of live predicates to 8. This
26839 greatly reduces the amount of spilling in certain loops. */
26840 classes[i++] = PR_LO_REGS;
26841 classes[i++] = PR_HI_REGS;
26842 return i;
26843 }
26844
26845 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26846
26847 static bool
26848 aarch64_can_change_mode_class (machine_mode from,
26849 machine_mode to, reg_class_t)
26850 {
26851 unsigned int from_flags = aarch64_classify_vector_mode (from);
26852 unsigned int to_flags = aarch64_classify_vector_mode (to);
26853
26854 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26855 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26856
26857 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26858 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26859
26860 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26861 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26862
26863 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26864 | VEC_PARTIAL));
26865 bool from_partial_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT
26866 | VEC_PARTIAL));
26867
26868 /* Don't allow changes between predicate modes and other modes.
26869 Only predicate registers can hold predicate modes and only
26870 non-predicate registers can hold non-predicate modes, so any
26871 attempt to mix them would require a round trip through memory. */
26872 if (from_pred_p != to_pred_p)
26873 return false;
26874
26875 /* Don't allow changes between partial SVE modes and other modes.
26876 The contents of partial SVE modes are distributed evenly across
26877 the register, whereas GCC expects them to be clustered together. */
26878 if (from_partial_sve_p != to_partial_sve_p)
26879 return false;
26880
26881 /* Similarly reject changes between partial SVE modes that have
26882 different patterns of significant and insignificant bits. */
26883 if (from_partial_sve_p
26884 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26885 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26886 return false;
26887
26888 /* Don't allow changes between partial Advanced SIMD structure modes and
26889 other modes unless both modes are 64 bits or smaller. */
26890 if ((to_partial_advsimd_struct_p ^ from_partial_advsimd_struct_p)
26891 && (known_gt (GET_MODE_SIZE (to), 8) || known_gt (GET_MODE_SIZE (from), 8)))
26892 return false;
26893
26894 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26895 {
26896 /* Don't allow changes between SVE modes and other modes that might
26897 be bigger than 128 bits. In particular, OImode, CImode and XImode
26898 divide into 128-bit quantities while SVE modes divide into
26899 BITS_PER_SVE_VECTOR quantities. */
26900 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26901 return false;
26902 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26903 return false;
26904 }
26905
26906 if (BYTES_BIG_ENDIAN)
26907 {
26908 /* Don't allow changes between SVE data modes and non-SVE modes.
26909 See the comment at the head of aarch64-sve.md for details. */
26910 if (from_sve_p != to_sve_p)
26911 return false;
26912
26913 /* Don't allow changes in element size: lane 0 of the new vector
26914 would not then be lane 0 of the old vector. See the comment
26915 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26916 description.
26917
26918 In the worst case, this forces a register to be spilled in
26919 one mode and reloaded in the other, which handles the
26920 endianness correctly. */
26921 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26922 return false;
26923 }
26924 return true;
26925 }
26926
26927 /* Implement TARGET_EARLY_REMAT_MODES. */
26928
26929 static void
26930 aarch64_select_early_remat_modes (sbitmap modes)
26931 {
26932 /* SVE values are not normally live across a call, so it should be
26933 worth doing early rematerialization even in VL-specific mode. */
26934 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26935 if (aarch64_sve_mode_p ((machine_mode) i))
26936 bitmap_set_bit (modes, i);
26937 }
26938
26939 /* Override the default target speculation_safe_value. */
26940 static rtx
26941 aarch64_speculation_safe_value (machine_mode mode,
26942 rtx result, rtx val, rtx failval)
26943 {
26944 /* Maybe we should warn if falling back to hard barriers. They are
26945 likely to be noticeably more expensive than the alternative below. */
26946 if (!aarch64_track_speculation)
26947 return default_speculation_safe_value (mode, result, val, failval);
26948
26949 if (!REG_P (val))
26950 val = copy_to_mode_reg (mode, val);
26951
26952 if (!aarch64_reg_or_zero (failval, mode))
26953 failval = copy_to_mode_reg (mode, failval);
26954
26955 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26956 return result;
26957 }
26958
26959 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26960 Look into the tuning structure for an estimate.
26961 KIND specifies the type of requested estimate: min, max or likely.
26962 For cores with a known SVE width all three estimates are the same.
26963 For generic SVE tuning we want to distinguish the maximum estimate from
26964 the minimum and likely ones.
26965 The likely estimate is the same as the minimum in that case to give a
26966 conservative behavior of auto-vectorizing with SVE when it is a win
26967 even for 128-bit SVE.
26968 When SVE width information is available VAL.coeffs[1] is multiplied by
26969 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
26970
26971 static HOST_WIDE_INT
26972 aarch64_estimated_poly_value (poly_int64 val,
26973 poly_value_estimate_kind kind
26974 = POLY_VALUE_LIKELY)
26975 {
26976 unsigned int width_source = aarch64_tune_params.sve_width;
26977
26978 /* If there is no core-specific information then the minimum and likely
26979 values are based on 128-bit vectors and the maximum is based on
26980 the architectural maximum of 2048 bits. */
26981 if (width_source == SVE_SCALABLE)
26982 switch (kind)
26983 {
26984 case POLY_VALUE_MIN:
26985 case POLY_VALUE_LIKELY:
26986 return val.coeffs[0];
26987 case POLY_VALUE_MAX:
26988 return val.coeffs[0] + val.coeffs[1] * 15;
26989 }
26990
26991 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26992 as likely. This could be made more general if future -mtune options
26993 need it to be. */
26994 if (kind == POLY_VALUE_MAX)
26995 width_source = 1 << floor_log2 (width_source);
26996 else
26997 width_source = least_bit_hwi (width_source);
26998
26999 /* If the core provides width information, use that. */
27000 HOST_WIDE_INT over_128 = width_source - 128;
27001 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
27002 }
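/* For example, the number of bytes in an SVE vector is the poly_int
   {16, 16}.  With no tuning information this is estimated as 16 for the
   minimum/likely cases and 16 + 16 * 15 == 256 (the 2048-bit architectural
   maximum) for the maximum, while a core known to implement 256-bit SVE
   gives 16 + 16 * 128 / 128 == 32 for all three estimates.  */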
27003
27004
27005 /* Return true for types that could be supported as SIMD return or
27006 argument types. */
27007
27008 static bool
27009 supported_simd_type (tree t)
27010 {
27011 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
27012 {
27013 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
27014 return s == 1 || s == 2 || s == 4 || s == 8;
27015 }
27016 return false;
27017 }
27018
27019 /* Return true for types that currently are supported as SIMD return
27020 or argument types. */
27021
27022 static bool
27023 currently_supported_simd_type (tree t, tree b)
27024 {
27025 if (COMPLEX_FLOAT_TYPE_P (t))
27026 return false;
27027
27028 if (TYPE_SIZE (t) != TYPE_SIZE (b))
27029 return false;
27030
27031 return supported_simd_type (t);
27032 }
27033
27034 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
27035
27036 static int
27037 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
27038 struct cgraph_simd_clone *clonei,
27039 tree base_type, int num,
27040 bool explicit_p)
27041 {
27042 tree t, ret_type;
27043 unsigned int elt_bits, count;
27044 unsigned HOST_WIDE_INT const_simdlen;
27045 poly_uint64 vec_bits;
27046
27047 if (!TARGET_SIMD)
27048 return 0;
27049
27050 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
27051 constant simdlens here. */
27052 if (maybe_ne (clonei->simdlen, 0U)
27053 && clonei->simdlen.is_constant (&const_simdlen)
27054 && (const_simdlen < 2
27055 || const_simdlen > 1024
27056 || (const_simdlen & (const_simdlen - 1)) != 0))
27057 {
27058 if (explicit_p)
27059 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27060 "unsupported simdlen %wd", const_simdlen);
27061 return 0;
27062 }
27063
27064 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
27065 if (TREE_CODE (ret_type) != VOID_TYPE
27066 && !currently_supported_simd_type (ret_type, base_type))
27067 {
27068 if (!explicit_p)
27069 ;
27070 else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
27071 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27072 "GCC does not currently support mixed size types "
27073 "for %<simd%> functions");
27074 else if (supported_simd_type (ret_type))
27075 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27076 "GCC does not currently support return type %qT "
27077 "for %<simd%> functions", ret_type);
27078 else
27079 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27080 "unsupported return type %qT for %<simd%> functions",
27081 ret_type);
27082 return 0;
27083 }
27084
27085 int i;
27086 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
27087 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
27088
27089 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
27090 t && t != void_list_node; t = TREE_CHAIN (t), i++)
27091 {
27092 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
27093
27094 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
27095 && !currently_supported_simd_type (arg_type, base_type))
27096 {
27097 if (!explicit_p)
27098 ;
27099 else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
27100 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27101 "GCC does not currently support mixed size types "
27102 "for %<simd%> functions");
27103 else
27104 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27105 "GCC does not currently support argument type %qT "
27106 "for %<simd%> functions", arg_type);
27107 return 0;
27108 }
27109 }
27110
27111 clonei->vecsize_mangle = 'n';
27112 clonei->mask_mode = VOIDmode;
27113 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
27114 if (known_eq (clonei->simdlen, 0U))
27115 {
27116 count = 2;
27117 vec_bits = (num == 0 ? 64 : 128);
27118 clonei->simdlen = exact_div (vec_bits, elt_bits);
27119 }
27120 else
27121 {
27122 count = 1;
27123 vec_bits = clonei->simdlen * elt_bits;
27124 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
27125 constant simdlens here. */
27126 if (clonei->simdlen.is_constant (&const_simdlen)
27127 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
27128 {
27129 if (explicit_p)
27130 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27131 "GCC does not currently support simdlen %wd for "
27132 "type %qT",
27133 const_simdlen, base_type);
27134 return 0;
27135 }
27136 }
27137 clonei->vecsize_int = vec_bits;
27138 clonei->vecsize_float = vec_bits;
27139 return count;
27140 }
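/* For example, a "#pragma omp declare simd" function taking and returning
   float, with no explicit simdlen, gets two Advanced SIMD clones from the
   code above: one with simdlen 2 (64-bit vectors) and one with simdlen 4
   (128-bit vectors), both using the 'n' mangling letter.  */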
27141
27142 /* Implement TARGET_SIMD_CLONE_ADJUST. */
27143
27144 static void
27145 aarch64_simd_clone_adjust (struct cgraph_node *node)
27146 {
27147 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
27148 use the correct ABI. */
27149
27150 tree t = TREE_TYPE (node->decl);
27151 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
27152 TYPE_ATTRIBUTES (t));
27153 }
27154
27155 /* Implement TARGET_SIMD_CLONE_USABLE. */
27156
27157 static int
27158 aarch64_simd_clone_usable (struct cgraph_node *node)
27159 {
27160 switch (node->simdclone->vecsize_mangle)
27161 {
27162 case 'n':
27163 if (!TARGET_SIMD)
27164 return -1;
27165 return 0;
27166 default:
27167 gcc_unreachable ();
27168 }
27169 }
27170
27171 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
27172
27173 static int
27174 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27175 {
27176 auto check_attr = [&](const char *name) {
27177 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27178 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27179 if (!attr1 && !attr2)
27180 return true;
27181
27182 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27183 };
27184
27185 if (!check_attr ("aarch64_vector_pcs"))
27186 return 0;
27187 if (!check_attr ("Advanced SIMD type"))
27188 return 0;
27189 if (!check_attr ("SVE type"))
27190 return 0;
27191 if (!check_attr ("SVE sizeless type"))
27192 return 0;
27193 return 1;
27194 }
27195
27196 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
27197
27198 static const char *
27199 aarch64_get_multilib_abi_name (void)
27200 {
27201 if (TARGET_BIG_END)
27202 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27203 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27204 }
27205
27206 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
27207 global-variable-based guard, use the default; otherwise
27208 return a null tree. */
27209 static tree
27210 aarch64_stack_protect_guard (void)
27211 {
27212 if (aarch64_stack_protector_guard == SSP_GLOBAL)
27213 return default_stack_protect_guard ();
27214
27215 return NULL_TREE;
27216 }
27217
27218 /* Return the diagnostic message string if conversion from FROMTYPE to
27219 TOTYPE is not allowed, NULL otherwise. */
27220
27221 static const char *
27222 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
27223 {
27224 if (element_mode (fromtype) != element_mode (totype))
27225 {
27226 /* Do not allow conversions to/from BFmode scalar types. */
27227 if (TYPE_MODE (fromtype) == BFmode)
27228 return N_("invalid conversion from type %<bfloat16_t%>");
27229 if (TYPE_MODE (totype) == BFmode)
27230 return N_("invalid conversion to type %<bfloat16_t%>");
27231 }
27232
27233 /* Conversion allowed. */
27234 return NULL;
27235 }
27236
27237 /* Return the diagnostic message string if the unary operation OP is
27238 not permitted on TYPE, NULL otherwise. */
27239
27240 static const char *
27241 aarch64_invalid_unary_op (int op, const_tree type)
27242 {
27243 /* Reject all single-operand operations on BFmode except for &. */
27244 if (element_mode (type) == BFmode && op != ADDR_EXPR)
27245 return N_("operation not permitted on type %<bfloat16_t%>");
27246
27247 /* Operation allowed. */
27248 return NULL;
27249 }
27250
27251 /* Return the diagnostic message string if the binary operation OP is
27252 not permitted on TYPE1 and TYPE2, NULL otherwise. */
27253
27254 static const char *
27255 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27256 const_tree type2)
27257 {
27258 /* Reject all 2-operand operations on BFmode. */
27259 if (element_mode (type1) == BFmode
27260 || element_mode (type2) == BFmode)
27261 return N_("operation not permitted on type %<bfloat16_t%>");
27262
27263 if (VECTOR_TYPE_P (type1)
27264 && VECTOR_TYPE_P (type2)
27265 && !TYPE_INDIVISIBLE_P (type1)
27266 && !TYPE_INDIVISIBLE_P (type2)
27267 && (aarch64_sve::builtin_type_p (type1)
27268 != aarch64_sve::builtin_type_p (type2)))
27269 return N_("cannot combine GNU and SVE vectors in a binary operation");
27270
27271 /* Operation allowed. */
27272 return NULL;
27273 }
27274
27275 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
27276 compiler that we automatically ignore the top byte of our pointers, which
27277 allows using -fsanitize=hwaddress. */
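/* ILP32 pointers are only 32 bits wide, so there is no spare top byte to
   carry a tag; tagging is therefore limited to LP64.  */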
27278 bool
27279 aarch64_can_tag_addresses ()
27280 {
27281 return !TARGET_ILP32;
27282 }
27283
27284 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
27285 section at the end if needed. */
27286 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
27287 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
27288 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
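/* As an illustrative sketch, for an LP64 target with both BTI and PAC
   enabled the note emitted below is roughly:

       .section        .note.gnu.property
       .p2align        3
       .word   4               (namesz: "GNU\0")
       .word   16              (descsz: 12 rounded up to pointer alignment)
       .word   5               (NT_GNU_PROPERTY_TYPE_0)
       .asciz  "GNU"
       .word   0xc0000000      (GNU_PROPERTY_AARCH64_FEATURE_1_AND)
       .word   4               (datasz)
       .word   3               (BTI | PAC)
       .p2align        3       (pad the descriptor)  */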
27289 void
27290 aarch64_file_end_indicate_exec_stack ()
27291 {
27292 file_end_indicate_exec_stack ();
27293
27294 unsigned feature_1_and = 0;
27295 if (aarch64_bti_enabled ())
27296 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27297
27298 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
27299 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27300
27301 if (feature_1_and)
27302 {
27303 /* Generate .note.gnu.property section. */
27304 switch_to_section (get_section (".note.gnu.property",
27305 SECTION_NOTYPE, NULL));
27306
27307 /* PT_NOTE header: namesz, descsz, type.
27308 namesz = 4 ("GNU\0")
27309 descsz = 16 (Size of the program property array)
27310 [(12 + padding) * Number of array elements]
27311 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
27312 assemble_align (POINTER_SIZE);
27313 assemble_integer (GEN_INT (4), 4, 32, 1);
27314 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27315 assemble_integer (GEN_INT (5), 4, 32, 1);
27316
27317 /* PT_NOTE name. */
27318 assemble_string ("GNU", 4);
27319
27320 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27321 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27322 datasz = 4
27323 data = feature_1_and. */
27324 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27325 assemble_integer (GEN_INT (4), 4, 32, 1);
27326 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27327
27328 /* Pad the size of the note to the required alignment. */
27329 assemble_align (POINTER_SIZE);
27330 }
27331 }
27332 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27333 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27334 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
27335
27336 /* Helper function for straight line speculation.
27337 Return what barrier should be emitted for straight line speculation
27338 mitigation.
27339 When not mitigating against straight line speculation this function returns
27340 an empty string.
27341 When mitigating against straight line speculation, use:
27342 * SB when the v8.5-A SB extension is enabled.
27343 * DSB+ISB otherwise. */
27344 const char *
27345 aarch64_sls_barrier (int mitigation_required)
27346 {
27347 return mitigation_required
27348 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27349 : "";
27350 }
27351
27352 static GTY (()) tree aarch64_sls_shared_thunks[30];
27353 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27354 const char *indirect_symbol_names[30] = {
27355 "__call_indirect_x0",
27356 "__call_indirect_x1",
27357 "__call_indirect_x2",
27358 "__call_indirect_x3",
27359 "__call_indirect_x4",
27360 "__call_indirect_x5",
27361 "__call_indirect_x6",
27362 "__call_indirect_x7",
27363 "__call_indirect_x8",
27364 "__call_indirect_x9",
27365 "__call_indirect_x10",
27366 "__call_indirect_x11",
27367 "__call_indirect_x12",
27368 "__call_indirect_x13",
27369 "__call_indirect_x14",
27370 "__call_indirect_x15",
27371 "", /* "__call_indirect_x16", */
27372 "", /* "__call_indirect_x17", */
27373 "__call_indirect_x18",
27374 "__call_indirect_x19",
27375 "__call_indirect_x20",
27376 "__call_indirect_x21",
27377 "__call_indirect_x22",
27378 "__call_indirect_x23",
27379 "__call_indirect_x24",
27380 "__call_indirect_x25",
27381 "__call_indirect_x26",
27382 "__call_indirect_x27",
27383 "__call_indirect_x28",
27384 "__call_indirect_x29",
27385 };
27386
27387 /* Function to create a BLR thunk. This thunk is used to mitigate straight
27388 line speculation. Instead of a simple BLR that can be speculated past,
27389 we emit a BL to this thunk, and this thunk contains a BR to the relevant
27390 register. These thunks have the relevant speculation barriers put after
27391 their indirect branch so that speculation is blocked.
27392
27393 We use such a thunk so the speculation barriers are kept off the
27394 architecturally executed path in order to reduce the performance overhead.
27395
27396 When optimizing for size we use stubs shared by the linked object.
27397 When optimizing for performance we emit stubs for each function in the hope
27398 that the branch predictor can better train on jumps specific for a given
27399 function. */
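/* As an illustrative sketch (not the exact assembly GCC prints), a call
   through x1 changes from:

       blr     x1

   to:

       bl      __call_indirect_x1

   (or a per-function local label when optimizing for speed), where the
   stub is:

       __call_indirect_x1:
           mov     x16, x1
           br      x16
           <speculation barrier: SB, or DSB SY followed by ISB>  */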
27400 rtx
27401 aarch64_sls_create_blr_label (int regnum)
27402 {
27403 gcc_assert (STUB_REGNUM_P (regnum));
27404 if (optimize_function_for_size_p (cfun))
27405 {
27406 /* For the thunks shared between different functions in this compilation
27407 unit we use a named symbol -- this is just for users to more easily
27408 understand the generated assembly. */
27409 aarch64_sls_shared_thunks_needed = true;
27410 const char *thunk_name = indirect_symbol_names[regnum];
27411 if (aarch64_sls_shared_thunks[regnum] == NULL)
27412 {
27413 /* Build a decl representing this function stub and record it for
27414 later. We build a decl here so we can use the GCC machinery for
27415 handling sections automatically (through `get_named_section` and
27416 `make_decl_one_only`). That saves us a lot of trouble handling
27417 the specifics of different output file formats. */
27418 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27419 get_identifier (thunk_name),
27420 build_function_type_list (void_type_node,
27421 NULL_TREE));
27422 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27423 NULL_TREE, void_type_node);
27424 TREE_PUBLIC (decl) = 1;
27425 TREE_STATIC (decl) = 1;
27426 DECL_IGNORED_P (decl) = 1;
27427 DECL_ARTIFICIAL (decl) = 1;
27428 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27429 resolve_unique_section (decl, 0, false);
27430 aarch64_sls_shared_thunks[regnum] = decl;
27431 }
27432
27433 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27434 }
27435
27436 if (cfun->machine->call_via[regnum] == NULL)
27437 cfun->machine->call_via[regnum]
27438 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27439 return cfun->machine->call_via[regnum];
27440 }
27441
27442 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27443 aarch64_sls_emit_shared_blr_thunks below. */
27444 static void
27445 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27446 {
27450 27447 /* Move the target into x16 and branch via x16; `BTI c` landing pads
27451 27448 accept `BR` only from x16 or x17, so the indirect jump stays valid. */
27449 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27450 asm_fprintf (out_file, "\tbr\tx16\n");
27451 }
27452
27453 /* Emit all BLR stubs for this particular function.
27454 Here we emit all the BLR stubs needed for the current function. Since we
27455 emit these stubs in a consecutive block we know there will be no speculation
27456 gadgets between each stub, and hence we only emit a speculation barrier at
27457 the end of the stub sequences.
27458
27459 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27460 void
27461 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27462 {
27463 if (! aarch64_harden_sls_blr_p ())
27464 return;
27465
27466 bool any_functions_emitted = false;
27467 /* We must save and restore the current function section since this assembly
27468 is emitted at the end of the function. This means it can be emitted *just
27469 after* the cold section of a function. That cold part would be emitted in
27470 a different section. That switch would trigger a `.cfi_endproc` directive
27471 to be emitted in the original section and a `.cfi_startproc` directive to
27472 be emitted in the new section. Switching to the original section without
27473 restoring would mean that the `.cfi_endproc` emitted when the function ends
27474 would happen in a different section -- leaving an unmatched
27475 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27476 in the standard text section. */
27477 section *save_text_section = in_section;
27478 switch_to_section (function_section (current_function_decl));
27479 for (int regnum = 0; regnum < 30; ++regnum)
27480 {
27481 rtx specu_label = cfun->machine->call_via[regnum];
27482 if (specu_label == NULL)
27483 continue;
27484
27485 targetm.asm_out.print_operand (out_file, specu_label, 0);
27486 asm_fprintf (out_file, ":\n");
27487 aarch64_sls_emit_function_stub (out_file, regnum);
27488 any_functions_emitted = true;
27489 }
27490 if (any_functions_emitted)
27491 /* Can use the SB here if need be, since this stub will only be used
27492 by the current function, and hence for the current target. */
27493 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27494 switch_to_section (save_text_section);
27495 }
27496
27497 /* Emit shared BLR stubs for the current compilation unit.
27498 Over the course of compiling this unit we may have converted some BLR
27499 instructions to a BL to a shared stub function. This is where we emit those
27500 stub functions.
27501 This function is for the stubs shared between different functions in this
27502 compilation unit. We share when optimizing for size instead of speed.
27503
27504 This function is called through the TARGET_ASM_FILE_END hook. */
27505 void
27506 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27507 {
27508 if (! aarch64_sls_shared_thunks_needed)
27509 return;
27510
27511 for (int regnum = 0; regnum < 30; ++regnum)
27512 {
27513 tree decl = aarch64_sls_shared_thunks[regnum];
27514 if (!decl)
27515 continue;
27516
27517 const char *name = indirect_symbol_names[regnum];
27518 switch_to_section (get_named_section (decl, NULL, 0));
27519 ASM_OUTPUT_ALIGN (out_file, 2);
27520 targetm.asm_out.globalize_label (out_file, name);
27524 27521 /* This only emits a directive if the compiler is configured for an
27525 27522 assembler that can handle visibility directives. */
27523 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27524 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27525 ASM_OUTPUT_LABEL (out_file, name);
27526 aarch64_sls_emit_function_stub (out_file, regnum);
27530 27527 /* Use the most conservative barrier (DSB SY followed by ISB) so the stub
27531 27528 can be used by any function in the translation unit, whatever its target. */
27529 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27530 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27531 }
27532 }
27533
27534 /* Implement TARGET_ASM_FILE_END. */
27535 void
27536 aarch64_asm_file_end ()
27537 {
27538 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27539 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27540 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27541 for FreeBSD) still gets called. */
27542 #ifdef TARGET_ASM_FILE_END
27543 TARGET_ASM_FILE_END ();
27544 #endif
27545 }
27546
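/* Output the assembly for an indirect call through register ADDR.  When SLS
   hardening of BLR instructions is enabled, emit a BL to the corresponding
   stub instead of a direct BLR.  */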
27547 const char *
27548 aarch64_indirect_call_asm (rtx addr)
27549 {
27550 gcc_assert (REG_P (addr));
27551 if (aarch64_harden_sls_blr_p ())
27552 {
27553 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27554 output_asm_insn ("bl\t%0", &stub_label);
27555 }
27556 else
27557 output_asm_insn ("blr\t%0", &addr);
27558 return "";
27559 }
27560
27561 /* Target-specific selftests. */
27562
27563 #if CHECKING_P
27564
27565 namespace selftest {
27566
27567 /* Selftest for the RTL loader.
27568 Verify that the RTL loader copes with a dump from
27569 print_rtx_function. This is essentially just a test that class
27570 function_reader can handle a real dump, but it also verifies
27571 that lookup_reg_by_dump_name correctly handles hard regs.
27572 The presence of hard reg names in the dump means that the test is
27573 target-specific, hence it is in this file. */
27574
27575 static void
27576 aarch64_test_loading_full_dump ()
27577 {
27578 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27579
27580 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27581
27582 rtx_insn *insn_1 = get_insn_by_uid (1);
27583 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27584
27585 rtx_insn *insn_15 = get_insn_by_uid (15);
27586 ASSERT_EQ (INSN, GET_CODE (insn_15));
27587 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27588
27589 /* Verify crtl->return_rtx. */
27590 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27591 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27592 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27593 }
27594
27595 /* Test the fractional_cost class. */
27596
27597 static void
27598 aarch64_test_fractional_cost ()
27599 {
27600 using cf = fractional_cost;
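/* cf (a, b) represents the rational cost a/b; cf (a) is the integral cost a. */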
27601
27602 ASSERT_EQ (cf (0, 20), 0);
27603
27604 ASSERT_EQ (cf (4, 2), 2);
27605 ASSERT_EQ (3, cf (9, 3));
27606
27607 ASSERT_NE (cf (5, 2), 2);
27608 ASSERT_NE (3, cf (8, 3));
27609
27610 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27611 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27612 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27613
27614 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27615 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27616 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27617 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27618 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27619 ASSERT_EQ (3 - cf (10, 3), 0);
27620
27621 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27622 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27623
27624 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27625 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27626 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27627 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27628 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27629 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27630 ASSERT_TRUE (cf (239, 240) < 1);
27631 ASSERT_FALSE (cf (240, 240) < 1);
27632 ASSERT_FALSE (cf (241, 240) < 1);
27633 ASSERT_FALSE (2 < cf (207, 104));
27634 ASSERT_FALSE (2 < cf (208, 104));
27635 ASSERT_TRUE (2 < cf (209, 104));
27636
27640 27637 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27641 27638 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27642 27639 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27643 27640 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27644 27641 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27645 27642 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27646 27643 ASSERT_TRUE (cf (239, 240) <= 1);
27647 27644 ASSERT_TRUE (cf (240, 240) <= 1);
27648 27645 ASSERT_FALSE (cf (241, 240) <= 1);
27649 27646 ASSERT_FALSE (2 <= cf (207, 104));
27650 27647 ASSERT_TRUE (2 <= cf (208, 104));
27651 27648 ASSERT_TRUE (2 <= cf (209, 104));
27649
27650 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27651 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27652 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27653 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27654 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27655 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27656 ASSERT_FALSE (cf (239, 240) >= 1);
27657 ASSERT_TRUE (cf (240, 240) >= 1);
27658 ASSERT_TRUE (cf (241, 240) >= 1);
27659 ASSERT_TRUE (2 >= cf (207, 104));
27660 ASSERT_TRUE (2 >= cf (208, 104));
27661 ASSERT_FALSE (2 >= cf (209, 104));
27662
27663 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27664 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27665 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27666 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27667 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27668 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27669 ASSERT_FALSE (cf (239, 240) > 1);
27670 ASSERT_FALSE (cf (240, 240) > 1);
27671 ASSERT_TRUE (cf (241, 240) > 1);
27672 ASSERT_TRUE (2 > cf (207, 104));
27673 ASSERT_FALSE (2 > cf (208, 104));
27674 ASSERT_FALSE (2 > cf (209, 104));
27675
27676 ASSERT_EQ (cf (1, 2).ceil (), 1);
27677 ASSERT_EQ (cf (11, 7).ceil (), 2);
27678 ASSERT_EQ (cf (20, 1).ceil (), 20);
27679 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27680 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27681 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27682 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27683 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27684
27685 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27686 }
27687
27688 /* Run all target-specific selftests. */
27689
27690 static void
27691 aarch64_run_selftests (void)
27692 {
27693 aarch64_test_loading_full_dump ();
27694 aarch64_test_fractional_cost ();
27695 }
27696
27697 } // namespace selftest
27698
27699 #endif /* #if CHECKING_P */
27700
27701 #undef TARGET_STACK_PROTECT_GUARD
27702 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27703
27704 #undef TARGET_ADDRESS_COST
27705 #define TARGET_ADDRESS_COST aarch64_address_cost
27706
27707 /* This hook determines whether unnamed bitfields affect the alignment
27708 of the containing structure. The hook returns true if the structure
27709 should inherit the alignment requirements of an unnamed bitfield's
27710 type. */
27711 #undef TARGET_ALIGN_ANON_BITFIELD
27712 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27713
27714 #undef TARGET_ASM_ALIGNED_DI_OP
27715 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27716
27717 #undef TARGET_ASM_ALIGNED_HI_OP
27718 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27719
27720 #undef TARGET_ASM_ALIGNED_SI_OP
27721 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27722
27723 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27724 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27725 hook_bool_const_tree_hwi_hwi_const_tree_true
27726
27727 #undef TARGET_ASM_FILE_START
27728 #define TARGET_ASM_FILE_START aarch64_start_file
27729
27730 #undef TARGET_ASM_OUTPUT_MI_THUNK
27731 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27732
27733 #undef TARGET_ASM_SELECT_RTX_SECTION
27734 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27735
27736 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27737 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27738
27739 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27740 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27741
27742 #undef TARGET_BUILD_BUILTIN_VA_LIST
27743 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27744
27745 #undef TARGET_CALLEE_COPIES
27746 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27747
27748 #undef TARGET_CAN_ELIMINATE
27749 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27750
27751 #undef TARGET_CAN_INLINE_P
27752 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27753
27754 #undef TARGET_CANNOT_FORCE_CONST_MEM
27755 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27756
27757 #undef TARGET_CASE_VALUES_THRESHOLD
27758 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27759
27760 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27761 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27762
27763 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27764 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27765
27766 /* Only the least significant bit is used for initialization guard
27767 variables. */
27768 #undef TARGET_CXX_GUARD_MASK_BIT
27769 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27770
27771 #undef TARGET_C_MODE_FOR_SUFFIX
27772 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27773
27774 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27775 #undef TARGET_DEFAULT_TARGET_FLAGS
27776 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27777 #endif
27778
27779 #undef TARGET_CLASS_MAX_NREGS
27780 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27781
27782 #undef TARGET_BUILTIN_DECL
27783 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27784
27785 #undef TARGET_BUILTIN_RECIPROCAL
27786 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27787
27788 #undef TARGET_C_EXCESS_PRECISION
27789 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27790
27791 #undef TARGET_EXPAND_BUILTIN
27792 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27793
27794 #undef TARGET_EXPAND_BUILTIN_VA_START
27795 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27796
27797 #undef TARGET_FOLD_BUILTIN
27798 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27799
27800 #undef TARGET_FUNCTION_ARG
27801 #define TARGET_FUNCTION_ARG aarch64_function_arg
27802
27803 #undef TARGET_FUNCTION_ARG_ADVANCE
27804 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27805
27806 #undef TARGET_FUNCTION_ARG_BOUNDARY
27807 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27808
27809 #undef TARGET_FUNCTION_ARG_PADDING
27810 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27811
27812 #undef TARGET_GET_RAW_RESULT_MODE
27813 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27814 #undef TARGET_GET_RAW_ARG_MODE
27815 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27816
27817 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27818 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27819
27820 #undef TARGET_FUNCTION_VALUE
27821 #define TARGET_FUNCTION_VALUE aarch64_function_value
27822
27823 #undef TARGET_FUNCTION_VALUE_REGNO_P
27824 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27825
27826 #undef TARGET_GIMPLE_FOLD_BUILTIN
27827 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27828
27829 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27830 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27831
27832 #undef TARGET_INIT_BUILTINS
27833 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27834
27835 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27836 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27837 aarch64_ira_change_pseudo_allocno_class
27838
27839 #undef TARGET_LEGITIMATE_ADDRESS_P
27840 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27841
27842 #undef TARGET_LEGITIMATE_CONSTANT_P
27843 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27844
27845 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27846 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27847 aarch64_legitimize_address_displacement
27848
27849 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27850 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27851
27852 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27853 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27854 aarch64_libgcc_floating_mode_supported_p
27855
27856 #undef TARGET_MANGLE_TYPE
27857 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27858
27859 #undef TARGET_INVALID_CONVERSION
27860 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27861
27862 #undef TARGET_INVALID_UNARY_OP
27863 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27864
27865 #undef TARGET_INVALID_BINARY_OP
27866 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27867
27868 #undef TARGET_VERIFY_TYPE_CONTEXT
27869 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27870
27871 #undef TARGET_MEMORY_MOVE_COST
27872 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27873
27874 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27875 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27876
27877 #undef TARGET_MUST_PASS_IN_STACK
27878 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27879
27880 /* This target hook should return true if accesses to volatile bitfields
27881 should use the narrowest mode possible. It should return false if these
27882 accesses should use the bitfield container type. */
27883 #undef TARGET_NARROW_VOLATILE_BITFIELD
27884 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27885
27886 #undef TARGET_OPTION_OVERRIDE
27887 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27888
27889 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27890 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27891 aarch64_override_options_after_change
27892
27893 #undef TARGET_OFFLOAD_OPTIONS
27894 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27895
27896 #undef TARGET_OPTION_RESTORE
27897 #define TARGET_OPTION_RESTORE aarch64_option_restore
27898
27899 #undef TARGET_OPTION_PRINT
27900 #define TARGET_OPTION_PRINT aarch64_option_print
27901
27902 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27903 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27904
27905 #undef TARGET_SET_CURRENT_FUNCTION
27906 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27907
27908 #undef TARGET_PASS_BY_REFERENCE
27909 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27910
27911 #undef TARGET_PREFERRED_RELOAD_CLASS
27912 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27913
27914 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27915 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27916
27917 #undef TARGET_DWARF_FRAME_REG_MODE
27918 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
27919
27920 #undef TARGET_PROMOTED_TYPE
27921 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27922
27923 #undef TARGET_SECONDARY_RELOAD
27924 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27925
27926 #undef TARGET_SECONDARY_MEMORY_NEEDED
27927 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
27928
27929 #undef TARGET_SHIFT_TRUNCATION_MASK
27930 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27931
27932 #undef TARGET_SETUP_INCOMING_VARARGS
27933 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27934
27935 #undef TARGET_STRUCT_VALUE_RTX
27936 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27937
27938 #undef TARGET_REGISTER_MOVE_COST
27939 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27940
27941 #undef TARGET_RETURN_IN_MEMORY
27942 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27943
27944 #undef TARGET_RETURN_IN_MSB
27945 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27946
27947 #undef TARGET_RTX_COSTS
27948 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27949
27950 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27951 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27952
27953 #undef TARGET_SCHED_ISSUE_RATE
27954 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27955
27956 #undef TARGET_SCHED_VARIABLE_ISSUE
27957 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27958
27959 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27960 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27961 aarch64_sched_first_cycle_multipass_dfa_lookahead
27962
27963 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27964 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27965 aarch64_first_cycle_multipass_dfa_lookahead_guard
27966
27967 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27968 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27969 aarch64_get_separate_components
27970
27971 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27972 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27973 aarch64_components_for_bb
27974
27975 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27976 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27977 aarch64_disqualify_components
27978
27979 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27980 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27981 aarch64_emit_prologue_components
27982
27983 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27984 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27985 aarch64_emit_epilogue_components
27986
27987 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27988 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27989 aarch64_set_handled_components
27990
27991 #undef TARGET_TRAMPOLINE_INIT
27992 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27993
27994 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27995 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27996
27997 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27998 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27999
28000 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
28001 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
28002
28003 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
28004 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
28005 aarch64_builtin_support_vector_misalignment
28006
28007 #undef TARGET_ARRAY_MODE
28008 #define TARGET_ARRAY_MODE aarch64_array_mode
28009
28010 #undef TARGET_ARRAY_MODE_SUPPORTED_P
28011 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
28012
28013 #undef TARGET_VECTORIZE_CREATE_COSTS
28014 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
28015
28016 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
28017 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
28018 aarch64_builtin_vectorization_cost
28019
28020 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
28021 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
28022
28023 #undef TARGET_VECTORIZE_BUILTINS
28024 #define TARGET_VECTORIZE_BUILTINS
28025
28026 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
28027 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
28028 aarch64_autovectorize_vector_modes
28029
28030 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
28031 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
28032 aarch64_atomic_assign_expand_fenv
28033
28034 /* Section anchor support. */
28035
28036 #undef TARGET_MIN_ANCHOR_OFFSET
28037 #define TARGET_MIN_ANCHOR_OFFSET -256
28038
28039 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
28040 byte offset; we can do much more for larger data types, but have no way
28041 to determine the size of the access. We assume accesses are aligned. */
28042 #undef TARGET_MAX_ANCHOR_OFFSET
28043 #define TARGET_MAX_ANCHOR_OFFSET 4095
28044
28045 #undef TARGET_VECTOR_ALIGNMENT
28046 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
28047
28048 #undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
28049 #define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
28050 aarch64_vectorize_can_special_div_by_constant
28051
28052 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
28053 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
28054 aarch64_vectorize_preferred_vector_alignment
28055 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
28056 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
28057 aarch64_simd_vector_alignment_reachable
28058
28059 /* vec_perm support. */
28060
28061 #undef TARGET_VECTORIZE_VEC_PERM_CONST
28062 #define TARGET_VECTORIZE_VEC_PERM_CONST \
28063 aarch64_vectorize_vec_perm_const
28064
28065 #undef TARGET_VECTORIZE_RELATED_MODE
28066 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
28067 #undef TARGET_VECTORIZE_GET_MASK_MODE
28068 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
28069 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
28070 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
28071 aarch64_empty_mask_is_expensive
28072 #undef TARGET_PREFERRED_ELSE_VALUE
28073 #define TARGET_PREFERRED_ELSE_VALUE \
28074 aarch64_preferred_else_value
28075
28076 #undef TARGET_INIT_LIBFUNCS
28077 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
28078
28079 #undef TARGET_FIXED_CONDITION_CODE_REGS
28080 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
28081
28082 #undef TARGET_FLAGS_REGNUM
28083 #define TARGET_FLAGS_REGNUM CC_REGNUM
28084
28085 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
28086 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
28087
28088 #undef TARGET_ASAN_SHADOW_OFFSET
28089 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
28090
28091 #undef TARGET_LEGITIMIZE_ADDRESS
28092 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
28093
28094 #undef TARGET_SCHED_CAN_SPECULATE_INSN
28095 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
28096
28097 #undef TARGET_CAN_USE_DOLOOP_P
28098 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
28099
28100 #undef TARGET_SCHED_ADJUST_PRIORITY
28101 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
28102
28103 #undef TARGET_SCHED_MACRO_FUSION_P
28104 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
28105
28106 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
28107 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
28108
28109 #undef TARGET_SCHED_FUSION_PRIORITY
28110 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
28111
28112 #undef TARGET_UNSPEC_MAY_TRAP_P
28113 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
28114
28115 #undef TARGET_USE_PSEUDO_PIC_REG
28116 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
28117
28118 #undef TARGET_PRINT_OPERAND
28119 #define TARGET_PRINT_OPERAND aarch64_print_operand
28120
28121 #undef TARGET_PRINT_OPERAND_ADDRESS
28122 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
28123
28124 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
28125 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
28126
28127 #undef TARGET_OPTAB_SUPPORTED_P
28128 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
28129
28130 #undef TARGET_OMIT_STRUCT_RETURN_REG
28131 #define TARGET_OMIT_STRUCT_RETURN_REG true
28132
28133 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
28134 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
28135 aarch64_dwarf_poly_indeterminate_value
28136
28137 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
28138 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
28139 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
28140
28141 #undef TARGET_HARD_REGNO_NREGS
28142 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
28143 #undef TARGET_HARD_REGNO_MODE_OK
28144 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
28145
28146 #undef TARGET_MODES_TIEABLE_P
28147 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
28148
28149 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
28150 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
28151 aarch64_hard_regno_call_part_clobbered
28152
28153 #undef TARGET_INSN_CALLEE_ABI
28154 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
28155
28156 #undef TARGET_CONSTANT_ALIGNMENT
28157 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
28158
28159 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
28160 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
28161 aarch64_stack_clash_protection_alloca_probe_range
28162
28163 #undef TARGET_COMPUTE_PRESSURE_CLASSES
28164 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28165
28166 #undef TARGET_CAN_CHANGE_MODE_CLASS
28167 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28168
28169 #undef TARGET_SELECT_EARLY_REMAT_MODES
28170 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28171
28172 #undef TARGET_SPECULATION_SAFE_VALUE
28173 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28174
28175 #undef TARGET_ESTIMATED_POLY_VALUE
28176 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28177
28178 #undef TARGET_ATTRIBUTE_TABLE
28179 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28180
28181 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28182 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28183 aarch64_simd_clone_compute_vecsize_and_simdlen
28184
28185 #undef TARGET_SIMD_CLONE_ADJUST
28186 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28187
28188 #undef TARGET_SIMD_CLONE_USABLE
28189 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28190
28191 #undef TARGET_COMP_TYPE_ATTRIBUTES
28192 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28193
28194 #undef TARGET_GET_MULTILIB_ABI_NAME
28195 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28196
28197 #undef TARGET_FNTYPE_ABI
28198 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
28199
28200 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28201 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28202
28203 #if CHECKING_P
28204 #undef TARGET_RUN_TARGET_SELFTESTS
28205 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28206 #endif /* #if CHECKING_P */
28207
28208 #undef TARGET_ASM_POST_CFI_STARTPROC
28209 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28210
28211 #undef TARGET_STRICT_ARGUMENT_NAMING
28212 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28213
28214 #undef TARGET_MD_ASM_ADJUST
28215 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28216
28217 #undef TARGET_ASM_FILE_END
28218 #define TARGET_ASM_FILE_END aarch64_asm_file_end
28219
28220 #undef TARGET_ASM_FUNCTION_EPILOGUE
28221 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28222
28223 #undef TARGET_HAVE_SHADOW_CALL_STACK
28224 #define TARGET_HAVE_SHADOW_CALL_STACK true
28225
28226 #undef TARGET_CONST_ANCHOR
28227 #define TARGET_CONST_ANCHOR 0x1000000
28228
28229 struct gcc_target targetm = TARGET_INITIALIZER;
28230
28231 #include "gt-aarch64.h"